{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2948, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 1914.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3331.0, "completions/max_terminated_length": 3331.0, "completions/mean_length": 1914.25, "completions/mean_terminated_length": 1914.25, "completions/min_length": 678.0, "completions/min_terminated_length": 678.0, "epoch": 0.00033921302578018993, "frac_reward_zero_std": 0.0, "grad_norm": 0.11923123896121979, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0003, "num_tokens": 36909.0, "reward": 0.6750000715255737, "reward_std": 0.061237238347530365, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 1 }, { "completion_length": 358.5, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 358.5, "completions/mean_terminated_length": 358.5, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.0006784260515603799, "frac_reward_zero_std": 0.0, "grad_norm": 0.16774195432662964, "kl": 0.0, "learning_rate": 1e-08, "loss": -0.0001, "num_tokens": 56733.0, "reward": 1.062500238418579, "reward_std": 0.14997151494026184, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.21250002086162567, "rewards/format_reward_func/std": 0.07723929733037949, "step": 2 }, { "completion_length": 1888.3334350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 4456.0, "completions/max_terminated_length": 4456.0, "completions/mean_length": 1888.3333740234375, "completions/mean_terminated_length": 1888.3333740234375, "completions/min_length": 647.0, "completions/min_terminated_length": 647.0, "epoch": 0.0010176390773405698, "frac_reward_zero_std": 0.5, "grad_norm": 0.14695346355438232, "kl": 0.0, "learning_rate": 2e-08, "loss": -0.0074, "num_tokens": 89689.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 3 }, { "completion_length": 2936.916748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5567.0, "completions/mean_length": 3486.0, "completions/mean_terminated_length": 3203.9091796875, "completions/min_length": 1971.0, "completions/min_terminated_length": 1971.0, "epoch": 0.0013568521031207597, "frac_reward_zero_std": 0.5, "grad_norm": 1.444371223449707, "kl": NaN, "learning_rate": 3e-08, "loss": -0.0084, "num_tokens": 135204.0, "reward": 1.0125000476837158, "reward_std": 0.31494051218032837, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 4 }, { "completion_length": 2239.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5995.0, "completions/max_terminated_length": 5995.0, "completions/mean_length": 2239.5, "completions/mean_terminated_length": 2239.5, "completions/min_length": 646.0, "completions/min_terminated_length": 646.0, "epoch": 0.0016960651289009499, "frac_reward_zero_std": 0.0, "grad_norm": 1.1211001873016357, "kl": 0.0, "learning_rate": 4e-08, "loss": 0.0407, "num_tokens": 174384.0, "reward": 0.6833333373069763, "reward_std": 0.5333091616630554, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.07385490089654922, "step": 5 }, { "completion_length": 2380.25, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6126.0, "completions/mean_length": 3478.416748046875, "completions/mean_terminated_length": 2856.300048828125, "completions/min_length": 702.0, "completions/min_terminated_length": 702.0, "epoch": 0.0020352781546811396, "frac_reward_zero_std": 0.5, "grad_norm": 0.597161054611206, "kl": NaN, "learning_rate": 5e-08, "loss": -0.0538, "num_tokens": 213567.0, "reward": 1.0291666984558105, "reward_std": 0.24208299815654755, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941801071167, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 6 }, { "completion_length": 1254.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3006.0, "completions/max_terminated_length": 3006.0, "completions/mean_length": 1254.75, "completions/mean_terminated_length": 1254.75, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.0023744911804613297, "frac_reward_zero_std": 0.0, "grad_norm": 0.42129963636398315, "kl": 0.0, "learning_rate": 6e-08, "loss": -0.0015, "num_tokens": 241776.0, "reward": 0.8166667819023132, "reward_std": 0.20202915370464325, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.07385490089654922, "step": 7 }, { "completion_length": 634.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1214.0, "completions/max_terminated_length": 1214.0, "completions/mean_length": 634.25, "completions/mean_terminated_length": 634.25, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.0027137042062415195, "frac_reward_zero_std": 0.0, "grad_norm": 0.0805782824754715, "kl": 0.0, "learning_rate": 7e-08, "loss": -0.0005, "num_tokens": 262713.0, "reward": 0.21250002086162567, "reward_std": 0.06934845447540283, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.21250002086162567, "rewards/format_reward_func/std": 0.07723929733037949, "step": 8 }, { "completion_length": 1587.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2970.0, "completions/max_terminated_length": 2970.0, "completions/mean_length": 1587.166748046875, "completions/mean_terminated_length": 1587.166748046875, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.0030529172320217096, "frac_reward_zero_std": 0.5, "grad_norm": 0.6700242161750793, "kl": 0.0, "learning_rate": 8e-08, "loss": -0.0088, "num_tokens": 291287.0, "reward": 0.9000000953674316, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.6000000238418579, "rewards/correctness_reward_func/std": 0.36181363463401794, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 9 }, { "completion_length": 1056.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2969.0, "completions/max_terminated_length": 2969.0, "completions/mean_length": 1056.416748046875, "completions/mean_terminated_length": 1056.416748046875, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.0033921302578018998, "frac_reward_zero_std": 0.0, "grad_norm": 0.37774741649627686, "kl": 0.0, "learning_rate": 9e-08, "loss": -0.0003, "num_tokens": 314290.0, "reward": 0.949999988079071, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.3357488214969635, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.07385490089654922, "step": 10 }, { "completion_length": 3576.666748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5869.0, "completions/mean_length": 4125.75, "completions/mean_terminated_length": 3901.818359375, "completions/min_length": 1444.0, "completions/min_terminated_length": 1444.0, "epoch": 0.0037313432835820895, "frac_reward_zero_std": 0.0, "grad_norm": 0.22871029376983643, "kl": NaN, "learning_rate": 1e-07, "loss": -0.017, "num_tokens": 372348.0, "reward": 0.6791667342185974, "reward_std": 0.10357433557510376, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 11 }, { "completion_length": 1701.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4018.0, "completions/max_terminated_length": 4018.0, "completions/mean_length": 1701.916748046875, "completions/mean_terminated_length": 1701.916748046875, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.004070556309362279, "frac_reward_zero_std": 0.0, "grad_norm": 0.6973493695259094, "kl": 0.0, "learning_rate": 1.0999999999999999e-07, "loss": -0.004, "num_tokens": 409031.0, "reward": 0.7041667699813843, "reward_std": 0.44965147972106934, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.07723929733037949, "step": 12 }, { "completion_length": 1128.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2031.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1128.75, "completions/mean_terminated_length": 1128.75, "completions/min_length": 639.0, "completions/min_terminated_length": 639.0, "epoch": 0.004409769335142469, "frac_reward_zero_std": 0.5, "grad_norm": 0.07278123497962952, "kl": 0.0, "learning_rate": 1.2e-07, "loss": -0.001, "num_tokens": 434738.0, "reward": 0.762499988079071, "reward_std": 0.04107918590307236, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 13 }, { "completion_length": 1370.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3375.0, "completions/max_terminated_length": 3375.0, "completions/mean_length": 1370.166748046875, "completions/mean_terminated_length": 1370.166748046875, "completions/min_length": 520.0, "completions/min_terminated_length": 520.0, "epoch": 0.0047489823609226595, "frac_reward_zero_std": 0.0, "grad_norm": 0.4310975968837738, "kl": 0.0, "learning_rate": 1.3e-07, "loss": 0.0046, "num_tokens": 464644.0, "reward": 1.0541667938232422, "reward_std": 0.24682849645614624, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.2534608840942383, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 14 }, { "completion_length": 2577.416748046875, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5694.0, "completions/mean_length": 3675.58349609375, "completions/mean_terminated_length": 3092.900146484375, "completions/min_length": 1257.0, "completions/min_terminated_length": 1257.0, "epoch": 0.00508819538670285, "frac_reward_zero_std": 0.5, "grad_norm": 0.4105437397956848, "kl": NaN, "learning_rate": 1.4e-07, "loss": -0.0759, "num_tokens": 511977.0, "reward": 0.9291667938232422, "reward_std": 0.26571446657180786, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 15 }, { "completion_length": 2431.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4410.0, "completions/max_terminated_length": 4410.0, "completions/mean_length": 2431.25, "completions/mean_terminated_length": 2431.25, "completions/min_length": 1113.0, "completions/min_terminated_length": 1113.0, "epoch": 0.005427408412483039, "frac_reward_zero_std": 0.0, "grad_norm": 2.087972640991211, "kl": 0.0, "learning_rate": 1.5e-07, "loss": -0.0046, "num_tokens": 552924.0, "reward": 0.6625000238418579, "reward_std": 0.26881134510040283, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4972652792930603, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 16 }, { "completion_length": 1940.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3607.0, "completions/max_terminated_length": 3607.0, "completions/mean_length": 1940.5, "completions/mean_terminated_length": 1940.5, "completions/min_length": 601.0, "completions/min_terminated_length": 601.0, "epoch": 0.005766621438263229, "frac_reward_zero_std": 0.0, "grad_norm": 0.6243698000907898, "kl": 0.0, "learning_rate": 1.6e-07, "loss": -0.0143, "num_tokens": 585378.0, "reward": 0.8791667819023132, "reward_std": 0.45129260420799255, "rewards/correctness_reward_func/mean": 0.6166666150093079, "rewards/correctness_reward_func/std": 0.37618502974510193, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 17 }, { "completion_length": 913.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1744.0, "completions/max_terminated_length": 1744.0, "completions/mean_length": 913.1666870117188, "completions/mean_terminated_length": 913.1666870117188, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.006105834464043419, "frac_reward_zero_std": 0.0, "grad_norm": 0.548653244972229, "kl": 0.0, "learning_rate": 1.7000000000000001e-07, "loss": 0.012, "num_tokens": 606578.0, "reward": 1.0875000953674316, "reward_std": 0.4288218021392822, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.38138505816459656, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 18 }, { "completion_length": 859.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 3215.0, "completions/max_terminated_length": 3215.0, "completions/mean_length": 859.75, "completions/mean_terminated_length": 859.75, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.006445047489823609, "frac_reward_zero_std": 0.0, "grad_norm": 0.5170750617980957, "kl": 0.0, "learning_rate": 1.8e-07, "loss": -0.0076, "num_tokens": 627719.0, "reward": 0.8833333849906921, "reward_std": 0.30571478605270386, "rewards/correctness_reward_func/mean": 0.6333333253860474, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.07385490089654922, "step": 19 }, { "completion_length": 1683.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3455.0, "completions/max_terminated_length": 3455.0, "completions/mean_length": 1683.75, "completions/mean_terminated_length": 1683.75, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "epoch": 0.0067842605156037995, "frac_reward_zero_std": 0.5, "grad_norm": 0.09887401759624481, "kl": 0.0, "learning_rate": 1.8999999999999998e-07, "loss": -0.001, "num_tokens": 657884.0, "reward": 0.762499988079071, "reward_std": 0.041079193353652954, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 20 }, { "completion_length": 2319.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4392.0, "completions/max_terminated_length": 4392.0, "completions/mean_length": 2319.166748046875, "completions/mean_terminated_length": 2319.166748046875, "completions/min_length": 1144.0, "completions/min_terminated_length": 1144.0, "epoch": 0.007123473541383989, "frac_reward_zero_std": 0.0, "grad_norm": 0.4928167760372162, "kl": 0.0, "learning_rate": 2e-07, "loss": 0.0001, "num_tokens": 695722.0, "reward": 1.070833444595337, "reward_std": 0.2486901879310608, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 21 }, { "completion_length": 723.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1397.0, "completions/max_terminated_length": 1397.0, "completions/mean_length": 723.5833740234375, "completions/mean_terminated_length": 723.5833740234375, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.007462686567164179, "frac_reward_zero_std": 0.0, "grad_norm": 0.5503113865852356, "kl": 0.0, "learning_rate": 2.0999999999999997e-07, "loss": -0.0044, "num_tokens": 716675.0, "reward": 0.8333333730697632, "reward_std": 0.5088584423065186, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.07385490089654922, "step": 22 }, { "completion_length": 1546.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3261.0, "completions/max_terminated_length": 3261.0, "completions/mean_length": 1546.5833740234375, "completions/mean_terminated_length": 1546.5833740234375, "completions/min_length": 726.0, "completions/min_terminated_length": 726.0, "epoch": 0.007801899592944369, "frac_reward_zero_std": 0.0, "grad_norm": 0.530082643032074, "kl": 0.0, "learning_rate": 2.1999999999999998e-07, "loss": -0.0047, "num_tokens": 744030.0, "reward": 0.5625, "reward_std": 0.295512855052948, "rewards/correctness_reward_func/mean": 0.29999998211860657, "rewards/correctness_reward_func/std": 0.4472135901451111, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 23 }, { "completion_length": 3152.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5800.0, "completions/max_terminated_length": 5800.0, "completions/mean_length": 3152.08349609375, "completions/mean_terminated_length": 3152.08349609375, "completions/min_length": 1237.0, "completions/min_terminated_length": 1237.0, "epoch": 0.008141112618724558, "frac_reward_zero_std": 1.0, "grad_norm": 5.056385816715192e-07, "kl": 0.0, "learning_rate": 2.3e-07, "loss": 0.0, "num_tokens": 794473.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 24 }, { "completion_length": 540.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1220.0, "completions/max_terminated_length": 1220.0, "completions/mean_length": 540.8333740234375, "completions/mean_terminated_length": 540.8333740234375, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.008480325644504749, "frac_reward_zero_std": 0.5, "grad_norm": 0.24769005179405212, "kl": 0.0, "learning_rate": 2.4e-07, "loss": -0.0003, "num_tokens": 811541.0, "reward": 0.7666667699813843, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 25 }, { "completion_length": 1279.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2073.0, "completions/max_terminated_length": 2073.0, "completions/mean_length": 1279.666748046875, "completions/mean_terminated_length": 1279.666748046875, "completions/min_length": 647.0, "completions/min_terminated_length": 647.0, "epoch": 0.008819538670284939, "frac_reward_zero_std": 0.0, "grad_norm": 0.1748448610305786, "kl": 0.0, "learning_rate": 2.5e-07, "loss": 0.0003, "num_tokens": 836509.0, "reward": 1.1666667461395264, "reward_std": 0.09559705853462219, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 26 }, { "completion_length": 1812.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 2724.0, "completions/max_terminated_length": 2724.0, "completions/mean_length": 1812.5, "completions/mean_terminated_length": 1812.5, "completions/min_length": 861.0, "completions/min_terminated_length": 861.0, "epoch": 0.009158751696065129, "frac_reward_zero_std": 0.5, "grad_norm": 0.10540489852428436, "kl": 0.0, "learning_rate": 2.6e-07, "loss": -0.0019, "num_tokens": 868297.0, "reward": 1.1541666984558105, "reward_std": 0.05103101581335068, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 27 }, { "completion_length": 1048.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2830.0, "completions/max_terminated_length": 2830.0, "completions/mean_length": 1048.0833740234375, "completions/mean_terminated_length": 1048.0833740234375, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.009497964721845319, "frac_reward_zero_std": 0.0, "grad_norm": 0.6303296089172363, "kl": 0.0, "learning_rate": 2.7e-07, "loss": -0.0051, "num_tokens": 891440.0, "reward": 0.9958333373069763, "reward_std": 0.41863998770713806, "rewards/correctness_reward_func/mean": 0.7333332896232605, "rewards/correctness_reward_func/std": 0.3550501763820648, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 28 }, { "completion_length": 1568.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3112.0, "completions/max_terminated_length": 3112.0, "completions/mean_length": 1568.0833740234375, "completions/mean_terminated_length": 1568.0833740234375, "completions/min_length": 1012.0, "completions/min_terminated_length": 1012.0, "epoch": 0.00983717774762551, "frac_reward_zero_std": 0.0, "grad_norm": 0.7948746085166931, "kl": 0.0, "learning_rate": 2.8e-07, "loss": 0.0124, "num_tokens": 922821.0, "reward": 0.783333420753479, "reward_std": 0.45408618450164795, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.07385490089654922, "step": 29 }, { "completion_length": 1348.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2316.0, "completions/max_terminated_length": 2316.0, "completions/mean_length": 1348.0, "completions/mean_terminated_length": 1348.0, "completions/min_length": 635.0, "completions/min_terminated_length": 635.0, "epoch": 0.0101763907734057, "frac_reward_zero_std": 0.0, "grad_norm": 0.48874858021736145, "kl": 0.0, "learning_rate": 2.9e-07, "loss": -0.0019, "num_tokens": 948873.0, "reward": 0.8166667819023132, "reward_std": 0.2557638883590698, "rewards/correctness_reward_func/mean": 0.5166666507720947, "rewards/correctness_reward_func/std": 0.4628632962703705, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 30 }, { "completion_length": 699.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1161.0, "completions/max_terminated_length": 1161.0, "completions/mean_length": 699.8333740234375, "completions/mean_terminated_length": 699.8333740234375, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.01051560379918589, "frac_reward_zero_std": 0.5, "grad_norm": 0.5726268291473389, "kl": 0.0, "learning_rate": 3e-07, "loss": -0.0029, "num_tokens": 971413.0, "reward": 0.887499988079071, "reward_std": 0.26016825437545776, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.43064433336257935, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.06784005463123322, "step": 31 }, { "completion_length": 3761.58349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 5482.0, "completions/max_terminated_length": 5482.0, "completions/mean_length": 3761.58349609375, "completions/mean_terminated_length": 3761.58349609375, "completions/min_length": 1546.0, "completions/min_terminated_length": 1546.0, "epoch": 0.010854816824966078, "frac_reward_zero_std": 0.0, "grad_norm": 1.1386401653289795, "kl": 0.0, "learning_rate": 3.1e-07, "loss": -0.0138, "num_tokens": 1028342.0, "reward": 0.7250000834465027, "reward_std": 0.4932064414024353, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577691078186035, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 32 }, { "completion_length": 1955.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3581.0, "completions/max_terminated_length": 3581.0, "completions/mean_length": 1955.3333740234375, "completions/mean_terminated_length": 1955.3333740234375, "completions/min_length": 715.0, "completions/min_terminated_length": 715.0, "epoch": 0.011194029850746268, "frac_reward_zero_std": 0.5, "grad_norm": 0.5336142778396606, "kl": 0.0, "learning_rate": 3.2e-07, "loss": 0.0114, "num_tokens": 1064280.0, "reward": 1.0375001430511475, "reward_std": 0.20600366592407227, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.24308621883392334, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 33 }, { "completion_length": 2596.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5696.0, "completions/max_terminated_length": 5696.0, "completions/mean_length": 2596.08349609375, "completions/mean_terminated_length": 2596.08349609375, "completions/min_length": 734.0, "completions/min_terminated_length": 734.0, "epoch": 0.011533242876526458, "frac_reward_zero_std": 0.5, "grad_norm": 0.12288849800825119, "kl": 0.0, "learning_rate": 3.3e-07, "loss": 0.0018, "num_tokens": 1106305.0, "reward": 0.6750000715255737, "reward_std": 0.03872983902692795, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 34 }, { "completion_length": 762.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 931.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 762.0833740234375, "completions/mean_terminated_length": 762.0833740234375, "completions/min_length": 600.0, "completions/min_terminated_length": 600.0, "epoch": 0.011872455902306648, "frac_reward_zero_std": 0.0, "grad_norm": 0.3861258625984192, "kl": 0.0, "learning_rate": 3.4000000000000003e-07, "loss": -0.0014, "num_tokens": 1127090.0, "reward": 0.595833420753479, "reward_std": 0.23264777660369873, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 35 }, { "completion_length": 1244.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2654.0, "completions/max_terminated_length": 2654.0, "completions/mean_length": 1244.5, "completions/mean_terminated_length": 1244.5, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 0.012211668928086838, "frac_reward_zero_std": 0.5, "grad_norm": 0.5422102212905884, "kl": 0.0, "learning_rate": 3.5e-07, "loss": 0.0022, "num_tokens": 1153376.0, "reward": 0.8583333492279053, "reward_std": 0.21946904063224792, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.5149286389350891, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 36 }, { "completion_length": 1497.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3081.0, "completions/max_terminated_length": 3081.0, "completions/mean_length": 1497.5833740234375, "completions/mean_terminated_length": 1497.5833740234375, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 0.012550881953867029, "frac_reward_zero_std": 0.5, "grad_norm": 0.6115618348121643, "kl": 0.0, "learning_rate": 3.6e-07, "loss": 0.0252, "num_tokens": 1183983.0, "reward": 0.9916667938232422, "reward_std": 0.2866472899913788, "rewards/correctness_reward_func/mean": 0.7166666984558105, "rewards/correctness_reward_func/std": 0.3459725081920624, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 37 }, { "completion_length": 1688.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5040.0, "completions/max_terminated_length": 5040.0, "completions/mean_length": 1688.75, "completions/mean_terminated_length": 1688.75, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 0.012890094979647219, "frac_reward_zero_std": 0.5, "grad_norm": 0.9230490922927856, "kl": 0.0, "learning_rate": 3.7e-07, "loss": -0.005, "num_tokens": 1215972.0, "reward": 0.5, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.20000000298023224, "rewards/correctness_reward_func/std": 0.36181363463401794, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 38 }, { "completion_length": 1880.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3490.0, "completions/max_terminated_length": 3490.0, "completions/mean_length": 1880.5833740234375, "completions/mean_terminated_length": 1880.5833740234375, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 0.013229308005427409, "frac_reward_zero_std": 0.5, "grad_norm": 0.08842471987009048, "kl": 0.0, "learning_rate": 3.7999999999999996e-07, "loss": 0.0011, "num_tokens": 1250587.0, "reward": 1.1875, "reward_std": 0.03061862848699093, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 39 }, { "completion_length": 1623.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3097.0, "completions/max_terminated_length": 3097.0, "completions/mean_length": 1623.5833740234375, "completions/mean_terminated_length": 1623.5833740234375, "completions/min_length": 996.0, "completions/min_terminated_length": 996.0, "epoch": 0.013568521031207599, "frac_reward_zero_std": 0.0, "grad_norm": 0.7013817429542542, "kl": 0.0, "learning_rate": 3.8999999999999997e-07, "loss": 0.0118, "num_tokens": 1284230.0, "reward": 1.0250000953674316, "reward_std": 0.42866072058677673, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.36306774616241455, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 40 }, { "completion_length": 1975.3333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3728.0, "completions/mean_length": 2524.416748046875, "completions/mean_terminated_length": 2154.9091796875, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "epoch": 0.013907734056987787, "frac_reward_zero_std": 0.0, "grad_norm": 0.6931108236312866, "kl": NaN, "learning_rate": 4e-07, "loss": -0.0256, "num_tokens": 1318170.0, "reward": 0.7166666984558105, "reward_std": 0.4313082695007324, "rewards/correctness_reward_func/mean": 0.46666669845581055, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.07385490089654922, "step": 41 }, { "completion_length": 1261.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2707.0, "completions/max_terminated_length": 2707.0, "completions/mean_length": 1261.416748046875, "completions/mean_terminated_length": 1261.416748046875, "completions/min_length": 644.0, "completions/min_terminated_length": 644.0, "epoch": 0.014246947082767978, "frac_reward_zero_std": 0.5, "grad_norm": 0.532822847366333, "kl": 0.0, "learning_rate": 4.0999999999999994e-07, "loss": 0.0143, "num_tokens": 1341893.0, "reward": 1.0875000953674316, "reward_std": 0.23008152842521667, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 42 }, { "completion_length": 1444.25, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4008.0, "completions/mean_length": 1993.3333740234375, "completions/mean_terminated_length": 1575.5455322265625, "completions/min_length": 525.0, "completions/min_terminated_length": 525.0, "epoch": 0.014586160108548168, "frac_reward_zero_std": 0.5, "grad_norm": 0.7497459650039673, "kl": NaN, "learning_rate": 4.1999999999999995e-07, "loss": -0.0279, "num_tokens": 1370816.0, "reward": 1.008333444595337, "reward_std": 0.22453658282756805, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 43 }, { "completion_length": 1885.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4084.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 1885.0833740234375, "completions/mean_terminated_length": 1885.0833740234375, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 0.014925373134328358, "frac_reward_zero_std": 0.5, "grad_norm": 0.11372081935405731, "kl": 0.0, "learning_rate": 4.2999999999999996e-07, "loss": -0.0021, "num_tokens": 1403379.0, "reward": 0.6625000834465027, "reward_std": 0.041079193353652954, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 44 }, { "completion_length": 967.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2843.0, "completions/max_terminated_length": 2843.0, "completions/mean_length": 967.0, "completions/mean_terminated_length": 967.0, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 0.015264586160108548, "frac_reward_zero_std": 0.0, "grad_norm": 0.4614796042442322, "kl": 0.0, "learning_rate": 4.3999999999999997e-07, "loss": -0.0099, "num_tokens": 1427145.0, "reward": 0.5708333849906921, "reward_std": 0.23264777660369873, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.07723929733037949, "step": 45 }, { "completion_length": 1987.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5091.0, "completions/max_terminated_length": 5091.0, "completions/mean_length": 1987.5833740234375, "completions/mean_terminated_length": 1987.5833740234375, "completions/min_length": 601.0, "completions/min_terminated_length": 601.0, "epoch": 0.015603799185888738, "frac_reward_zero_std": 0.5, "grad_norm": 0.45858681201934814, "kl": 0.0, "learning_rate": 4.5e-07, "loss": -0.0061, "num_tokens": 1462708.0, "reward": 1.133333444595337, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 46 }, { "completion_length": 2157.0001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3806.0, "completions/mean_length": 2706.08349609375, "completions/mean_terminated_length": 2353.091064453125, "completions/min_length": 1247.0, "completions/min_terminated_length": 1247.0, "epoch": 0.01594301221166893, "frac_reward_zero_std": 0.0, "grad_norm": 0.5918885469436646, "kl": NaN, "learning_rate": 4.6e-07, "loss": -0.0191, "num_tokens": 1500250.0, "reward": 0.9625000953674316, "reward_std": 0.4404165744781494, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.3357488214969635, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 47 }, { "completion_length": 2361.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 6359.0, "completions/max_terminated_length": 6359.0, "completions/mean_length": 2361.166748046875, "completions/mean_terminated_length": 2361.166748046875, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 0.016282225237449117, "frac_reward_zero_std": 0.0, "grad_norm": 8.699708938598633, "kl": 0.0, "learning_rate": 4.6999999999999995e-07, "loss": -0.0034, "num_tokens": 1538424.0, "reward": 0.6791667342185974, "reward_std": 0.08190402388572693, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 48 }, { "completion_length": 3042.5833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6276.0, "completions/mean_length": 3591.666748046875, "completions/mean_terminated_length": 3319.181884765625, "completions/min_length": 962.0, "completions/min_terminated_length": 962.0, "epoch": 0.01662143826322931, "frac_reward_zero_std": 0.0, "grad_norm": 0.9630971550941467, "kl": NaN, "learning_rate": 4.8e-07, "loss": -0.0544, "num_tokens": 1590673.0, "reward": 0.7124999761581421, "reward_std": 0.49692243337631226, "rewards/correctness_reward_func/mean": 0.45000001788139343, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 49 }, { "completion_length": 930.0000305175781, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 2777.0, "completions/mean_length": 4224.5, "completions/mean_terminated_length": 1860.0, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 0.016960651289009497, "frac_reward_zero_std": 0.5, "grad_norm": 0.09431289881467819, "kl": NaN, "learning_rate": 4.9e-07, "loss": -0.0007, "num_tokens": 1614277.0, "reward": 0.5666667222976685, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 50 }, { "completion_length": 1810.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3579.0, "completions/max_terminated_length": 3579.0, "completions/mean_length": 1810.25, "completions/mean_terminated_length": 1810.25, "completions/min_length": 668.0, "completions/min_terminated_length": 668.0, "epoch": 0.01729986431478969, "frac_reward_zero_std": 0.0, "grad_norm": 0.6033844947814941, "kl": 0.0, "learning_rate": 5e-07, "loss": 0.0042, "num_tokens": 1650094.0, "reward": 0.9291666746139526, "reward_std": 0.38409334421157837, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 51 }, { "completion_length": 3461.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5404.0, "completions/max_terminated_length": 5404.0, "completions/mean_length": 3461.25, "completions/mean_terminated_length": 3461.25, "completions/min_length": 2344.0, "completions/min_terminated_length": 2344.0, "epoch": 0.017639077340569877, "frac_reward_zero_std": 0.0, "grad_norm": 0.9066393971443176, "kl": 0.0, "learning_rate": 4.998274672187715e-07, "loss": 0.0169, "num_tokens": 1707199.0, "reward": 0.8625000715255737, "reward_std": 0.43920692801475525, "rewards/correctness_reward_func/mean": 0.6000000238418579, "rewards/correctness_reward_func/std": 0.36181360483169556, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 52 }, { "completion_length": 621.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1293.0, "completions/max_terminated_length": 1293.0, "completions/mean_length": 621.6666870117188, "completions/mean_terminated_length": 621.6666870117188, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.01797829036635007, "frac_reward_zero_std": 0.0, "grad_norm": 0.07819852232933044, "kl": 0.0, "learning_rate": 4.996549344375431e-07, "loss": -0.0013, "num_tokens": 1725603.0, "reward": 0.7124999761581421, "reward_std": 0.06934843957424164, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.21250002086162567, "rewards/format_reward_func/std": 0.07723929733037949, "step": 53 }, { "completion_length": 2232.0000610351562, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5477.0, "completions/mean_length": 3330.166748046875, "completions/mean_terminated_length": 2678.400146484375, "completions/min_length": 944.0, "completions/min_terminated_length": 944.0, "epoch": 0.018317503392130258, "frac_reward_zero_std": 0.0, "grad_norm": 0.5813143849372864, "kl": NaN, "learning_rate": 4.994824016563146e-07, "loss": -0.0049, "num_tokens": 1765119.0, "reward": 0.5583333373069763, "reward_std": 0.23038136959075928, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2250000238418579, "rewards/format_reward_func/std": 0.11965861171483994, "step": 54 }, { "completion_length": 529.9166717529297, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 529.9166870117188, "completions/mean_terminated_length": 529.9166870117188, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.018656716417910446, "frac_reward_zero_std": 0.5, "grad_norm": 0.04094817116856575, "kl": 0.0, "learning_rate": 4.993098688750863e-07, "loss": 0.0001, "num_tokens": 1786892.0, "reward": 1.1375000476837158, "reward_std": 0.030618613585829735, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.07723929733037949, "step": 55 }, { "completion_length": 2408.58349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 4855.0, "completions/max_terminated_length": 4855.0, "completions/mean_length": 2408.58349609375, "completions/mean_terminated_length": 2408.58349609375, "completions/min_length": 1078.0, "completions/min_terminated_length": 1078.0, "epoch": 0.018995929443690638, "frac_reward_zero_std": 0.5, "grad_norm": 0.11791203916072845, "kl": 0.0, "learning_rate": 4.991373360938578e-07, "loss": 0.0014, "num_tokens": 1823361.0, "reward": 0.7875000834465027, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 56 }, { "completion_length": 499.4166717529297, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 499.41668701171875, "completions/mean_terminated_length": 499.41668701171875, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.019335142469470826, "frac_reward_zero_std": 1.0, "grad_norm": 2.9338906415432575e-07, "kl": 0.0, "learning_rate": 4.989648033126294e-07, "loss": 0.0, "num_tokens": 1837562.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 57 }, { "completion_length": 1457.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3312.0, "completions/max_terminated_length": 3312.0, "completions/mean_length": 1457.5, "completions/mean_terminated_length": 1457.5, "completions/min_length": 528.0, "completions/min_terminated_length": 528.0, "epoch": 0.01967435549525102, "frac_reward_zero_std": 0.5, "grad_norm": 0.41315922141075134, "kl": 0.0, "learning_rate": 4.98792270531401e-07, "loss": 0.0077, "num_tokens": 1867334.0, "reward": 0.9541667699813843, "reward_std": 0.22716552019119263, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 58 }, { "completion_length": 2239.666748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4822.0, "completions/mean_length": 2788.75, "completions/mean_terminated_length": 2443.272705078125, "completions/min_length": 1249.0, "completions/min_terminated_length": 1249.0, "epoch": 0.020013568521031207, "frac_reward_zero_std": 0.0, "grad_norm": 0.6937410831451416, "kl": NaN, "learning_rate": 4.986197377501725e-07, "loss": -0.0354, "num_tokens": 1910866.0, "reward": 0.6125000715255737, "reward_std": 0.26673299074172974, "rewards/correctness_reward_func/mean": 0.3500000238418579, "rewards/correctness_reward_func/std": 0.4358898997306824, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 59 }, { "completion_length": 740.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1572.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 740.25, "completions/mean_terminated_length": 740.25, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.0203527815468114, "frac_reward_zero_std": 0.0, "grad_norm": 0.09556394070386887, "kl": 0.0, "learning_rate": 4.984472049689441e-07, "loss": -0.0012, "num_tokens": 1933849.0, "reward": 1.2208333015441895, "reward_std": 0.07144342362880707, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.07723929733037949, "step": 60 }, { "completion_length": 546.4166717529297, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 1595.0, "completions/mean_length": 3840.916748046875, "completions/mean_terminated_length": 1092.8333740234375, "completions/min_length": 628.0, "completions/min_terminated_length": 628.0, "epoch": 0.020691994572591587, "frac_reward_zero_std": 0.0, "grad_norm": 0.37896645069122314, "kl": NaN, "learning_rate": 4.982746721877156e-07, "loss": 0.006, "num_tokens": 1950060.0, "reward": 0.4833333492279053, "reward_std": 0.22453658282756805, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.1430193930864334, "step": 61 }, { "completion_length": 958.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1381.0, "completions/max_terminated_length": 1381.0, "completions/mean_length": 958.3333740234375, "completions/mean_terminated_length": 958.3333740234375, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "epoch": 0.02103120759837178, "frac_reward_zero_std": 0.5, "grad_norm": 0.059960030019283295, "kl": 0.0, "learning_rate": 4.981021394064872e-07, "loss": 0.0007, "num_tokens": 1975876.0, "reward": 0.6875001788139343, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 62 }, { "completion_length": 1011.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2134.0, "completions/max_terminated_length": 2134.0, "completions/mean_length": 1011.5, "completions/mean_terminated_length": 1011.5, "completions/min_length": 520.0, "completions/min_terminated_length": 520.0, "epoch": 0.021370420624151967, "frac_reward_zero_std": 1.0, "grad_norm": 3.14549453150903e-07, "kl": 0.0, "learning_rate": 4.979296066252588e-07, "loss": 0.0, "num_tokens": 2003572.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 63 }, { "completion_length": 2115.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4262.0, "completions/max_terminated_length": 4262.0, "completions/mean_length": 2115.58349609375, "completions/mean_terminated_length": 2115.58349609375, "completions/min_length": 690.0, "completions/min_terminated_length": 690.0, "epoch": 0.021709633649932156, "frac_reward_zero_std": 1.0, "grad_norm": 2.780624583920144e-07, "kl": 0.0, "learning_rate": 4.977570738440303e-07, "loss": 0.0, "num_tokens": 2040947.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 64 }, { "completion_length": 5106.33349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 6488.0, "completions/max_terminated_length": 6488.0, "completions/mean_length": 5106.33349609375, "completions/mean_terminated_length": 5106.33349609375, "completions/min_length": 2990.0, "completions/min_terminated_length": 2990.0, "epoch": 0.022048846675712348, "frac_reward_zero_std": 0.0, "grad_norm": 1.3122785091400146, "kl": 0.0, "learning_rate": 4.975845410628019e-07, "loss": -0.008, "num_tokens": 2112147.0, "reward": 0.7666666507720947, "reward_std": 0.521875262260437, "rewards/correctness_reward_func/mean": 0.5166666507720947, "rewards/correctness_reward_func/std": 0.4628632962703705, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.07385490089654922, "step": 65 }, { "completion_length": 2463.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4120.0, "completions/max_terminated_length": 4120.0, "completions/mean_length": 2463.916748046875, "completions/mean_terminated_length": 2463.916748046875, "completions/min_length": 899.0, "completions/min_terminated_length": 899.0, "epoch": 0.022388059701492536, "frac_reward_zero_std": 0.0, "grad_norm": 0.8168641924858093, "kl": 0.0, "learning_rate": 4.974120082815735e-07, "loss": -0.0017, "num_tokens": 2156594.0, "reward": 0.6208333969116211, "reward_std": 0.44094666838645935, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 66 }, { "completion_length": 2151.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4150.0, "completions/max_terminated_length": 4150.0, "completions/mean_length": 2151.166748046875, "completions/mean_terminated_length": 2151.166748046875, "completions/min_length": 526.0, "completions/min_terminated_length": 526.0, "epoch": 0.022727272727272728, "frac_reward_zero_std": 0.0, "grad_norm": 0.6010086536407471, "kl": 0.0, "learning_rate": 4.97239475500345e-07, "loss": -0.0047, "num_tokens": 2194228.0, "reward": 1.0, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.3357488214969635, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 67 }, { "completion_length": 690.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1049.0, "completions/max_terminated_length": 1049.0, "completions/mean_length": 690.75, "completions/mean_terminated_length": 690.75, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.023066485753052916, "frac_reward_zero_std": 0.5, "grad_norm": 0.03490245342254639, "kl": 0.0, "learning_rate": 4.970669427191166e-07, "loss": -0.0001, "num_tokens": 2215375.0, "reward": 0.6875001788139343, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 68 }, { "completion_length": 2392.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4364.0, "completions/max_terminated_length": 4364.0, "completions/mean_length": 2392.83349609375, "completions/mean_terminated_length": 2392.83349609375, "completions/min_length": 1438.0, "completions/min_terminated_length": 1438.0, "epoch": 0.023405698778833108, "frac_reward_zero_std": 0.0, "grad_norm": 0.14046727120876312, "kl": 0.0, "learning_rate": 4.968944099378881e-07, "loss": 0.0027, "num_tokens": 2255921.0, "reward": 0.6916667819023132, "reward_std": 0.07955466210842133, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 69 }, { "completion_length": 3071.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 6239.0, "completions/max_terminated_length": 6239.0, "completions/mean_length": 3071.83349609375, "completions/mean_terminated_length": 3071.83349609375, "completions/min_length": 683.0, "completions/min_terminated_length": 683.0, "epoch": 0.023744911804613297, "frac_reward_zero_std": 0.5, "grad_norm": 0.15312537550926208, "kl": 0.0, "learning_rate": 4.967218771566598e-07, "loss": -0.0006, "num_tokens": 2304747.0, "reward": 0.27500003576278687, "reward_std": 0.03872983902692795, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 70 }, { "completion_length": 2039.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4137.0, "completions/max_terminated_length": 4137.0, "completions/mean_length": 2039.166748046875, "completions/mean_terminated_length": 2039.166748046875, "completions/min_length": 865.0, "completions/min_terminated_length": 865.0, "epoch": 0.02408412483039349, "frac_reward_zero_std": 0.0, "grad_norm": 0.6391112208366394, "kl": 0.0, "learning_rate": 4.965493443754314e-07, "loss": 0.0104, "num_tokens": 2340371.0, "reward": 1.1375000476837158, "reward_std": 0.2848537564277649, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444522619247437, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 71 }, { "completion_length": 2516.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4060.0, "completions/max_terminated_length": 4060.0, "completions/mean_length": 2516.08349609375, "completions/mean_terminated_length": 2516.08349609375, "completions/min_length": 964.0, "completions/min_terminated_length": 964.0, "epoch": 0.024423337856173677, "frac_reward_zero_std": 0.0, "grad_norm": 0.47134825587272644, "kl": 0.0, "learning_rate": 4.963768115942029e-07, "loss": -0.0085, "num_tokens": 2383092.0, "reward": 0.8416666984558105, "reward_std": 0.20202915370464325, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 72 }, { "completion_length": 2631.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5089.0, "completions/max_terminated_length": 5089.0, "completions/mean_length": 2631.83349609375, "completions/mean_terminated_length": 2631.83349609375, "completions/min_length": 1202.0, "completions/min_terminated_length": 1202.0, "epoch": 0.024762550881953865, "frac_reward_zero_std": 0.0, "grad_norm": 1.5083867311477661, "kl": 0.0, "learning_rate": 4.962042788129745e-07, "loss": 0.0137, "num_tokens": 2426014.0, "reward": 0.9833334684371948, "reward_std": 0.2473839521408081, "rewards/correctness_reward_func/mean": 0.6833333969116211, "rewards/correctness_reward_func/std": 0.32427075505256653, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 73 }, { "completion_length": 1854.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2987.0, "completions/max_terminated_length": 2987.0, "completions/mean_length": 1854.166748046875, "completions/mean_terminated_length": 1854.166748046875, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "epoch": 0.025101763907734057, "frac_reward_zero_std": 0.5, "grad_norm": 0.07933083176612854, "kl": 0.0, "learning_rate": 4.96031746031746e-07, "loss": 0.001, "num_tokens": 2461752.0, "reward": 0.2875000238418579, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 74 }, { "completion_length": 1426.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3497.0, "completions/max_terminated_length": 3497.0, "completions/mean_length": 1426.666748046875, "completions/mean_terminated_length": 1426.666748046875, "completions/min_length": 627.0, "completions/min_terminated_length": 627.0, "epoch": 0.025440976933514246, "frac_reward_zero_std": 1.0, "grad_norm": 2.802689778036438e-07, "kl": 0.0, "learning_rate": 4.958592132505176e-07, "loss": 0.0, "num_tokens": 2487038.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 75 }, { "completion_length": 1519.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3167.0, "completions/max_terminated_length": 3167.0, "completions/mean_length": 1519.916748046875, "completions/mean_terminated_length": 1519.916748046875, "completions/min_length": 605.0, "completions/min_terminated_length": 605.0, "epoch": 0.025780189959294438, "frac_reward_zero_std": 0.5, "grad_norm": 0.08159344643354416, "kl": 0.0, "learning_rate": 4.956866804692891e-07, "loss": 0.0011, "num_tokens": 2513005.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 76 }, { "completion_length": 1891.0834350585938, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3705.0, "completions/mean_length": 2440.166748046875, "completions/mean_terminated_length": 2063.0, "completions/min_length": 823.0, "completions/min_terminated_length": 823.0, "epoch": 0.026119402985074626, "frac_reward_zero_std": 0.5, "grad_norm": 0.5463436245918274, "kl": NaN, "learning_rate": 4.955141476880607e-07, "loss": -0.014, "num_tokens": 2549246.0, "reward": 0.8500000834465027, "reward_std": 0.2752271890640259, "rewards/correctness_reward_func/mean": 0.5999999642372131, "rewards/correctness_reward_func/std": 0.36181363463401794, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.09770084172487259, "step": 77 }, { "completion_length": 2267.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4360.0, "completions/max_terminated_length": 4360.0, "completions/mean_length": 2267.58349609375, "completions/mean_terminated_length": 2267.58349609375, "completions/min_length": 943.0, "completions/min_terminated_length": 943.0, "epoch": 0.026458616010854818, "frac_reward_zero_std": 0.0, "grad_norm": 0.3025602400302887, "kl": 0.0, "learning_rate": 4.953416149068323e-07, "loss": 0.0118, "num_tokens": 2588073.0, "reward": 1.1583333015441895, "reward_std": 0.10206204652786255, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 78 }, { "completion_length": 1166.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4041.0, "completions/max_terminated_length": 4041.0, "completions/mean_length": 1166.0, "completions/mean_terminated_length": 1166.0, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 0.026797829036635006, "frac_reward_zero_std": 0.5, "grad_norm": 0.4117281138896942, "kl": 0.0, "learning_rate": 4.951690821256038e-07, "loss": 0.0011, "num_tokens": 2612667.0, "reward": 1.0208334922790527, "reward_std": 0.19391795992851257, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 79 }, { "completion_length": 2583.83349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 4004.0, "completions/max_terminated_length": 4004.0, "completions/mean_length": 2583.83349609375, "completions/mean_terminated_length": 2583.83349609375, "completions/min_length": 1107.0, "completions/min_terminated_length": 1107.0, "epoch": 0.027137042062415198, "frac_reward_zero_std": 0.0, "grad_norm": 0.5607545971870422, "kl": 0.0, "learning_rate": 4.949965493443754e-07, "loss": 0.0271, "num_tokens": 2658175.0, "reward": 1.0, "reward_std": 0.2581988573074341, "rewards/correctness_reward_func/mean": 0.7000000476837158, "rewards/correctness_reward_func/std": 0.3357488214969635, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 80 }, { "completion_length": 1255.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 3033.0, "completions/max_terminated_length": 3033.0, "completions/mean_length": 1255.3333740234375, "completions/mean_terminated_length": 1255.3333740234375, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 0.027476255088195387, "frac_reward_zero_std": 0.5, "grad_norm": 0.32516950368881226, "kl": 0.0, "learning_rate": 4.94824016563147e-07, "loss": -0.0018, "num_tokens": 2681411.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 81 }, { "completion_length": 2111.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5036.0, "completions/max_terminated_length": 5036.0, "completions/mean_length": 2111.25, "completions/mean_terminated_length": 2111.25, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 0.027815468113975575, "frac_reward_zero_std": 0.0, "grad_norm": 0.35042816400527954, "kl": 0.0, "learning_rate": 4.946514837819185e-07, "loss": 0.0032, "num_tokens": 2717276.0, "reward": 1.0916666984558105, "reward_std": 0.25535523891448975, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 82 }, { "completion_length": 2550.166748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5000.0, "completions/mean_length": 3099.25, "completions/mean_terminated_length": 2782.0, "completions/min_length": 1123.0, "completions/min_terminated_length": 1123.0, "epoch": 0.028154681139755767, "frac_reward_zero_std": 0.0, "grad_norm": 1.0620650053024292, "kl": NaN, "learning_rate": 4.944789510006901e-07, "loss": -0.0267, "num_tokens": 2759404.0, "reward": 0.7916666865348816, "reward_std": 0.4943132996559143, "rewards/correctness_reward_func/mean": 0.5166666507720947, "rewards/correctness_reward_func/std": 0.4628632962703705, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 83 }, { "completion_length": 2253.5001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4483.0, "completions/mean_length": 2802.58349609375, "completions/mean_terminated_length": 2458.36376953125, "completions/min_length": 885.0, "completions/min_terminated_length": 885.0, "epoch": 0.028493894165535955, "frac_reward_zero_std": 0.5, "grad_norm": 0.1895497888326645, "kl": NaN, "learning_rate": 4.943064182194616e-07, "loss": -0.0107, "num_tokens": 2801404.0, "reward": 0.6750000715255737, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 84 }, { "completion_length": 2357.0833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4887.0, "completions/mean_length": 2906.166748046875, "completions/mean_terminated_length": 2571.36376953125, "completions/min_length": 1036.0, "completions/min_terminated_length": 1036.0, "epoch": 0.028833107191316147, "frac_reward_zero_std": 0.0, "grad_norm": 0.8943297863006592, "kl": NaN, "learning_rate": 4.941338854382333e-07, "loss": -0.0042, "num_tokens": 2841851.0, "reward": 0.9291667342185974, "reward_std": 0.4828321933746338, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 85 }, { "completion_length": 2335.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4581.0, "completions/max_terminated_length": 4581.0, "completions/mean_length": 2335.75, "completions/mean_terminated_length": 2335.75, "completions/min_length": 1136.0, "completions/min_terminated_length": 1136.0, "epoch": 0.029172320217096336, "frac_reward_zero_std": 0.5, "grad_norm": 0.6140693426132202, "kl": 0.0, "learning_rate": 4.939613526570047e-07, "loss": -0.0082, "num_tokens": 2883254.0, "reward": 1.1041667461395264, "reward_std": 0.2685222029685974, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.3857303261756897, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 86 }, { "completion_length": 1397.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3341.0, "completions/max_terminated_length": 3341.0, "completions/mean_length": 1397.666748046875, "completions/mean_terminated_length": 1397.666748046875, "completions/min_length": 530.0, "completions/min_terminated_length": 530.0, "epoch": 0.029511533242876527, "frac_reward_zero_std": 0.0, "grad_norm": 0.753872275352478, "kl": 0.0, "learning_rate": 4.937888198757764e-07, "loss": -0.032, "num_tokens": 2910766.0, "reward": 0.9625000357627869, "reward_std": 0.3697127103805542, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.43064433336257935, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 87 }, { "completion_length": 1930.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3123.0, "completions/max_terminated_length": 3123.0, "completions/mean_length": 1930.666748046875, "completions/mean_terminated_length": 1930.666748046875, "completions/min_length": 634.0, "completions/min_terminated_length": 634.0, "epoch": 0.029850746268656716, "frac_reward_zero_std": 0.5, "grad_norm": 0.6493222713470459, "kl": 0.0, "learning_rate": 4.93616287094548e-07, "loss": -0.0047, "num_tokens": 2943504.0, "reward": 0.8291666507720947, "reward_std": 0.18534879386425018, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 88 }, { "completion_length": 1324.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3149.0, "completions/max_terminated_length": 3149.0, "completions/mean_length": 1324.75, "completions/mean_terminated_length": 1324.75, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "epoch": 0.030189959294436908, "frac_reward_zero_std": 0.5, "grad_norm": 0.42489466071128845, "kl": 0.0, "learning_rate": 4.934437543133195e-07, "loss": -0.0001, "num_tokens": 2974185.0, "reward": 0.6208333969116211, "reward_std": 0.19391795992851257, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 89 }, { "completion_length": 1459.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2928.0, "completions/max_terminated_length": 2928.0, "completions/mean_length": 1459.0833740234375, "completions/mean_terminated_length": 1459.0833740234375, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 0.030529172320217096, "frac_reward_zero_std": 0.5, "grad_norm": 0.0769336149096489, "kl": 0.0, "learning_rate": 4.932712215320911e-07, "loss": -0.0001, "num_tokens": 3005680.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 90 }, { "completion_length": 763.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1107.0, "completions/max_terminated_length": 1107.0, "completions/mean_length": 763.0833740234375, "completions/mean_terminated_length": 763.0833740234375, "completions/min_length": 507.0, "completions/min_terminated_length": 507.0, "epoch": 0.030868385345997285, "frac_reward_zero_std": 0.0, "grad_norm": 0.07657662034034729, "kl": 0.0, "learning_rate": 4.930986887508626e-07, "loss": -0.0011, "num_tokens": 3029669.0, "reward": 0.7041667699813843, "reward_std": 0.07144345343112946, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 91 }, { "completion_length": 777.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1282.0, "completions/max_terminated_length": 1282.0, "completions/mean_length": 777.0, "completions/mean_terminated_length": 777.0, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.031207598371777476, "frac_reward_zero_std": 1.0, "grad_norm": 8.553340791195296e-08, "kl": 0.0, "learning_rate": 4.929261559696342e-07, "loss": 0.0, "num_tokens": 3051119.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 92 }, { "completion_length": 2085.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5398.0, "completions/max_terminated_length": 5398.0, "completions/mean_length": 2085.75, "completions/mean_terminated_length": 2085.75, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 0.03154681139755767, "frac_reward_zero_std": 0.5, "grad_norm": 0.419355571269989, "kl": 0.0, "learning_rate": 4.927536231884058e-07, "loss": 0.0014, "num_tokens": 3089666.0, "reward": 0.8958333730697632, "reward_std": 0.298921674489975, "rewards/correctness_reward_func/mean": 0.6333333253860474, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 93 }, { "completion_length": 2719.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4738.0, "completions/max_terminated_length": 4738.0, "completions/mean_length": 2719.33349609375, "completions/mean_terminated_length": 2719.33349609375, "completions/min_length": 763.0, "completions/min_terminated_length": 763.0, "epoch": 0.03188602442333786, "frac_reward_zero_std": 0.0, "grad_norm": 0.419344425201416, "kl": 0.0, "learning_rate": 4.925810904071773e-07, "loss": 0.0041, "num_tokens": 3131586.0, "reward": 0.6083334684371948, "reward_std": 0.1906316578388214, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 94 }, { "completion_length": 2132.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5472.0, "completions/max_terminated_length": 5472.0, "completions/mean_length": 2132.25, "completions/mean_terminated_length": 2132.25, "completions/min_length": 753.0, "completions/min_terminated_length": 753.0, "epoch": 0.032225237449118045, "frac_reward_zero_std": 0.0, "grad_norm": 0.3389549255371094, "kl": 0.0, "learning_rate": 4.924085576259489e-07, "loss": 0.0064, "num_tokens": 3170469.0, "reward": 1.0875000953674316, "reward_std": 0.24555771052837372, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 95 }, { "completion_length": 1292.5000305175781, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3358.0, "completions/mean_length": 1841.5833740234375, "completions/mean_terminated_length": 1410.0, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 0.032564450474898234, "frac_reward_zero_std": 0.0, "grad_norm": 0.10573876649141312, "kl": NaN, "learning_rate": 4.922360248447205e-07, "loss": -0.0043, "num_tokens": 3200067.0, "reward": 0.6375000476837158, "reward_std": 0.06934845447540283, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.07723929733037949, "step": 96 }, { "completion_length": 1452.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2044.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1452.0, "completions/mean_terminated_length": 1452.0, "completions/min_length": 820.0, "completions/min_terminated_length": 820.0, "epoch": 0.03290366350067843, "frac_reward_zero_std": 0.5, "grad_norm": 0.44858041405677795, "kl": 0.0, "learning_rate": 4.92063492063492e-07, "loss": 0.0015, "num_tokens": 3228213.0, "reward": 0.833333432674408, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 97 }, { "completion_length": 2012.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 6447.0, "completions/max_terminated_length": 6447.0, "completions/mean_length": 2012.0, "completions/mean_terminated_length": 2012.0, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.03324287652645862, "frac_reward_zero_std": 0.0, "grad_norm": 0.17230212688446045, "kl": 0.0, "learning_rate": 4.918909592822636e-07, "loss": 0.0018, "num_tokens": 3265227.0, "reward": 1.120833396911621, "reward_std": 0.10867881774902344, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.07723929733037949, "step": 98 }, { "completion_length": 2578.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4870.0, "completions/max_terminated_length": 4870.0, "completions/mean_length": 2578.166748046875, "completions/mean_terminated_length": 2578.166748046875, "completions/min_length": 967.0, "completions/min_terminated_length": 967.0, "epoch": 0.033582089552238806, "frac_reward_zero_std": 0.5, "grad_norm": 0.5569886565208435, "kl": 0.0, "learning_rate": 4.917184265010351e-07, "loss": -0.0017, "num_tokens": 3313511.0, "reward": 0.9541667699813843, "reward_std": 0.22716552019119263, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 99 }, { "completion_length": 715.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1045.0, "completions/max_terminated_length": 1045.0, "completions/mean_length": 715.3333740234375, "completions/mean_terminated_length": 715.3333740234375, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.033921302578018994, "frac_reward_zero_std": 0.5, "grad_norm": 0.06762003153562546, "kl": 0.0, "learning_rate": 4.915458937198068e-07, "loss": -0.0004, "num_tokens": 3333903.0, "reward": 1.1625001430511475, "reward_std": 0.04107918590307236, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 100 }, { "completion_length": 2714.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6070.0, "completions/max_terminated_length": 6070.0, "completions/mean_length": 2714.916748046875, "completions/mean_terminated_length": 2714.916748046875, "completions/min_length": 710.0, "completions/min_terminated_length": 710.0, "epoch": 0.03426051560379918, "frac_reward_zero_std": 0.0, "grad_norm": 0.32408711314201355, "kl": 0.0, "learning_rate": 4.913733609385783e-07, "loss": -0.0026, "num_tokens": 3380318.0, "reward": 0.7000000476837158, "reward_std": 0.09350207448005676, "rewards/correctness_reward_func/mean": 0.45000001788139343, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.07385490089654922, "step": 101 }, { "completion_length": 669.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1200.0, "completions/max_terminated_length": 1200.0, "completions/mean_length": 669.0, "completions/mean_terminated_length": 669.0, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 0.03459972862957938, "frac_reward_zero_std": 0.5, "grad_norm": 0.05602804571390152, "kl": 0.0, "learning_rate": 4.912008281573499e-07, "loss": -0.0003, "num_tokens": 3398468.0, "reward": 1.0750000476837158, "reward_std": 0.038729824125766754, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 102 }, { "completion_length": 2353.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6165.0, "completions/max_terminated_length": 6165.0, "completions/mean_length": 2353.166748046875, "completions/mean_terminated_length": 2353.166748046875, "completions/min_length": 870.0, "completions/min_terminated_length": 870.0, "epoch": 0.034938941655359566, "frac_reward_zero_std": 0.5, "grad_norm": 0.4930708408355713, "kl": 0.0, "learning_rate": 4.910282953761215e-07, "loss": 0.043, "num_tokens": 3432304.0, "reward": 1.133333444595337, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 103 }, { "completion_length": 2527.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4175.0, "completions/max_terminated_length": 4175.0, "completions/mean_length": 2527.5, "completions/mean_terminated_length": 2527.5, "completions/min_length": 1459.0, "completions/min_terminated_length": 1459.0, "epoch": 0.035278154681139755, "frac_reward_zero_std": 0.5, "grad_norm": 0.2505856454372406, "kl": 0.0, "learning_rate": 4.90855762594893e-07, "loss": 0.0011, "num_tokens": 3473278.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 104 }, { "completion_length": 1665.3333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5170.0, "completions/mean_length": 2214.416748046875, "completions/mean_terminated_length": 1816.727294921875, "completions/min_length": 827.0, "completions/min_terminated_length": 827.0, "epoch": 0.03561736770691994, "frac_reward_zero_std": 0.0, "grad_norm": 0.7407315969467163, "kl": NaN, "learning_rate": 4.906832298136646e-07, "loss": -0.0215, "num_tokens": 3503390.0, "reward": 0.783333420753479, "reward_std": 0.4725285470485687, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.09770084172487259, "step": 105 }, { "completion_length": 3103.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4550.0, "completions/max_terminated_length": 4550.0, "completions/mean_length": 3103.25, "completions/mean_terminated_length": 3103.25, "completions/min_length": 1426.0, "completions/min_terminated_length": 1426.0, "epoch": 0.03595658073270014, "frac_reward_zero_std": 0.5, "grad_norm": 0.6687279343605042, "kl": 0.0, "learning_rate": 4.905106970324361e-07, "loss": 0.0069, "num_tokens": 3552125.0, "reward": 0.8375000953674316, "reward_std": 0.2458404153585434, "rewards/correctness_reward_func/mean": 0.550000011920929, "rewards/correctness_reward_func/std": 0.4100997745990753, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 106 }, { "completion_length": 1616.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3833.0, "completions/max_terminated_length": 3833.0, "completions/mean_length": 1616.416748046875, "completions/mean_terminated_length": 1616.416748046875, "completions/min_length": 682.0, "completions/min_terminated_length": 682.0, "epoch": 0.03629579375848033, "frac_reward_zero_std": 0.0, "grad_norm": 0.5577266812324524, "kl": 0.0, "learning_rate": 4.903381642512077e-07, "loss": 0.0133, "num_tokens": 3581374.0, "reward": 1.0333333015441895, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.35505014657974243, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 107 }, { "completion_length": 1808.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3992.0, "completions/max_terminated_length": 3992.0, "completions/mean_length": 1808.25, "completions/mean_terminated_length": 1808.25, "completions/min_length": 550.0, "completions/min_terminated_length": 550.0, "epoch": 0.036635006784260515, "frac_reward_zero_std": 0.5, "grad_norm": 0.0762360617518425, "kl": 0.0, "learning_rate": 4.901656314699793e-07, "loss": 0.0011, "num_tokens": 3614665.0, "reward": 1.254166603088379, "reward_std": 0.05103101581335068, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 108 }, { "completion_length": 1865.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3235.0, "completions/max_terminated_length": 3235.0, "completions/mean_length": 1865.8333740234375, "completions/mean_terminated_length": 1865.8333740234375, "completions/min_length": 726.0, "completions/min_terminated_length": 726.0, "epoch": 0.036974219810040704, "frac_reward_zero_std": 0.5, "grad_norm": 0.45066243410110474, "kl": 0.0, "learning_rate": 4.899930986887508e-07, "loss": 0.0067, "num_tokens": 3649031.0, "reward": 1.0333335399627686, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 109 }, { "completion_length": 1545.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3815.0, "completions/max_terminated_length": 3815.0, "completions/mean_length": 1545.3333740234375, "completions/mean_terminated_length": 1545.3333740234375, "completions/min_length": 562.0, "completions/min_terminated_length": 562.0, "epoch": 0.03731343283582089, "frac_reward_zero_std": 0.0, "grad_norm": 0.9093514680862427, "kl": 0.0, "learning_rate": 4.898205659075224e-07, "loss": 0.0008, "num_tokens": 3681831.0, "reward": 1.0250000953674316, "reward_std": 0.3382870554924011, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.36306774616241455, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 110 }, { "completion_length": 2410.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5848.0, "completions/max_terminated_length": 5848.0, "completions/mean_length": 2410.916748046875, "completions/mean_terminated_length": 2410.916748046875, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.03765264586160109, "frac_reward_zero_std": 0.0, "grad_norm": 0.689710795879364, "kl": 0.0, "learning_rate": 4.89648033126294e-07, "loss": 0.0, "num_tokens": 3725348.0, "reward": 0.9833333492279053, "reward_std": 0.3129711151123047, "rewards/correctness_reward_func/mean": 0.6833333373069763, "rewards/correctness_reward_func/std": 0.4217568039894104, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 111 }, { "completion_length": 1068.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2515.0, "completions/max_terminated_length": 2515.0, "completions/mean_length": 1068.666748046875, "completions/mean_terminated_length": 1068.666748046875, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.037991858887381276, "frac_reward_zero_std": 0.5, "grad_norm": 0.5011056661605835, "kl": 0.0, "learning_rate": 4.894755003450655e-07, "loss": 0.0033, "num_tokens": 3749494.0, "reward": 0.9541667699813843, "reward_std": 0.22716552019119263, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 112 }, { "completion_length": 1666.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4317.0, "completions/max_terminated_length": 4317.0, "completions/mean_length": 1666.5833740234375, "completions/mean_terminated_length": 1666.5833740234375, "completions/min_length": 685.0, "completions/min_terminated_length": 685.0, "epoch": 0.038331071913161464, "frac_reward_zero_std": 0.5, "grad_norm": 0.4090554416179657, "kl": 0.0, "learning_rate": 4.893029675638371e-07, "loss": 0.0057, "num_tokens": 3779723.0, "reward": 0.6333333253860474, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 113 }, { "completion_length": 2164.5000610351562, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5738.0, "completions/mean_length": 4360.83349609375, "completions/mean_terminated_length": 3246.75, "completions/min_length": 1838.0, "completions/min_terminated_length": 1838.0, "epoch": 0.03867028493894165, "frac_reward_zero_std": 0.0, "grad_norm": 0.8448308706283569, "kl": NaN, "learning_rate": 4.891304347826087e-07, "loss": -0.0637, "num_tokens": 3821321.0, "reward": 0.3499999940395355, "reward_std": 0.4247293472290039, "rewards/correctness_reward_func/mean": 0.14999999105930328, "rewards/correctness_reward_func/std": 0.35290998220443726, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 114 }, { "completion_length": 1456.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2937.0, "completions/max_terminated_length": 2937.0, "completions/mean_length": 1456.25, "completions/mean_terminated_length": 1456.25, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 0.03900949796472185, "frac_reward_zero_std": 0.5, "grad_norm": 0.050372909754514694, "kl": 0.0, "learning_rate": 4.889579020013803e-07, "loss": 0.0004, "num_tokens": 3846560.0, "reward": 0.75, "reward_std": 0.03872983902692795, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.07385490089654922, "step": 115 }, { "completion_length": 3040.0001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4961.0, "completions/mean_length": 3589.08349609375, "completions/mean_terminated_length": 3316.36376953125, "completions/min_length": 1877.0, "completions/min_terminated_length": 1877.0, "epoch": 0.03934871099050204, "frac_reward_zero_std": 0.0, "grad_norm": 0.7781913876533508, "kl": NaN, "learning_rate": 4.887853692201518e-07, "loss": -0.038, "num_tokens": 3899264.0, "reward": 1.0250000953674316, "reward_std": 0.26536139845848083, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.24308621883392334, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 116 }, { "completion_length": 3644.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6581.0, "completions/max_terminated_length": 6581.0, "completions/mean_length": 3644.25, "completions/mean_terminated_length": 3644.25, "completions/min_length": 1230.0, "completions/min_terminated_length": 1230.0, "epoch": 0.039687924016282225, "frac_reward_zero_std": 0.5, "grad_norm": 0.624457597732544, "kl": 0.0, "learning_rate": 4.886128364389234e-07, "loss": 0.0081, "num_tokens": 3961217.0, "reward": 1.1375000476837158, "reward_std": 0.20600365102291107, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444525599479675, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 117 }, { "completion_length": 2229.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 3979.0, "completions/max_terminated_length": 3979.0, "completions/mean_length": 2229.5, "completions/mean_terminated_length": 2229.5, "completions/min_length": 1003.0, "completions/min_terminated_length": 1003.0, "epoch": 0.04002713704206241, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.88440303657695e-07, "loss": 0.0, "num_tokens": 4004513.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 118 }, { "completion_length": 2228.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3815.0, "completions/max_terminated_length": 3815.0, "completions/mean_length": 2228.416748046875, "completions/mean_terminated_length": 2228.416748046875, "completions/min_length": 864.0, "completions/min_terminated_length": 864.0, "epoch": 0.0403663500678426, "frac_reward_zero_std": 0.0, "grad_norm": 0.7541438937187195, "kl": 0.0, "learning_rate": 4.882677708764665e-07, "loss": -0.0092, "num_tokens": 4043008.0, "reward": 0.7000000476837158, "reward_std": 0.41311824321746826, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 119 }, { "completion_length": 2333.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5397.0, "completions/max_terminated_length": 5397.0, "completions/mean_length": 2333.83349609375, "completions/mean_terminated_length": 2333.83349609375, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.0407055630936228, "frac_reward_zero_std": 0.0, "grad_norm": 0.7183325290679932, "kl": 0.0, "learning_rate": 4.880952380952381e-07, "loss": 0.0125, "num_tokens": 4085732.0, "reward": 0.970833420753479, "reward_std": 0.3699861466884613, "rewards/correctness_reward_func/mean": 0.6833333969116211, "rewards/correctness_reward_func/std": 0.32427075505256653, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 120 }, { "completion_length": 2344.83349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 5423.0, "completions/max_terminated_length": 5423.0, "completions/mean_length": 2344.83349609375, "completions/mean_terminated_length": 2344.83349609375, "completions/min_length": 556.0, "completions/min_terminated_length": 556.0, "epoch": 0.041044776119402986, "frac_reward_zero_std": 0.0, "grad_norm": 0.18063339591026306, "kl": 0.0, "learning_rate": 4.879227053140096e-07, "loss": 0.0024, "num_tokens": 4126794.0, "reward": 1.2291667461395264, "reward_std": 0.0927189290523529, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 121 }, { "completion_length": 2123.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3868.0, "completions/max_terminated_length": 3868.0, "completions/mean_length": 2123.166748046875, "completions/mean_terminated_length": 2123.166748046875, "completions/min_length": 1150.0, "completions/min_terminated_length": 1150.0, "epoch": 0.041383989145183174, "frac_reward_zero_std": 0.5, "grad_norm": 0.7253784537315369, "kl": 0.0, "learning_rate": 4.877501725327812e-07, "loss": 0.0166, "num_tokens": 4164896.0, "reward": 1.120833396911621, "reward_std": 0.19391795992851257, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 122 }, { "completion_length": 1970.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4138.0, "completions/max_terminated_length": 4138.0, "completions/mean_length": 1970.416748046875, "completions/mean_terminated_length": 1970.416748046875, "completions/min_length": 1346.0, "completions/min_terminated_length": 1346.0, "epoch": 0.04172320217096336, "frac_reward_zero_std": 0.5, "grad_norm": 0.3625893294811249, "kl": 0.0, "learning_rate": 4.875776397515527e-07, "loss": 0.005, "num_tokens": 4200325.0, "reward": 1.0208334922790527, "reward_std": 0.19391795992851257, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 123 }, { "completion_length": 2338.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6154.0, "completions/max_terminated_length": 6154.0, "completions/mean_length": 2338.666748046875, "completions/mean_terminated_length": 2338.666748046875, "completions/min_length": 917.0, "completions/min_terminated_length": 917.0, "epoch": 0.04206241519674356, "frac_reward_zero_std": 0.5, "grad_norm": 0.6652452349662781, "kl": 0.0, "learning_rate": 4.874051069703243e-07, "loss": 0.0208, "num_tokens": 4235175.0, "reward": 0.46250003576278687, "reward_std": 0.26016825437545776, "rewards/correctness_reward_func/mean": 0.20000000298023224, "rewards/correctness_reward_func/std": 0.36181363463401794, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 124 }, { "completion_length": 1641.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3666.0, "completions/max_terminated_length": 3666.0, "completions/mean_length": 1641.75, "completions/mean_terminated_length": 1641.75, "completions/min_length": 552.0, "completions/min_terminated_length": 552.0, "epoch": 0.042401628222523746, "frac_reward_zero_std": 0.0, "grad_norm": 0.5879623889923096, "kl": 0.0, "learning_rate": 4.872325741890959e-07, "loss": 0.0249, "num_tokens": 4266114.0, "reward": 1.1041667461395264, "reward_std": 0.23474274575710297, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 125 }, { "completion_length": 2013.8333740234375, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6537.0, "completions/mean_length": 4210.1669921875, "completions/mean_terminated_length": 3020.75, "completions/min_length": 1015.0, "completions/min_terminated_length": 1015.0, "epoch": 0.042740841248303935, "frac_reward_zero_std": 0.5, "grad_norm": 0.6589382290840149, "kl": NaN, "learning_rate": 4.870600414078675e-07, "loss": -0.093, "num_tokens": 4304308.0, "reward": 0.7333334684371948, "reward_std": 0.2840188145637512, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 126 }, { "completion_length": 1316.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2623.0, "completions/max_terminated_length": 2623.0, "completions/mean_length": 1316.916748046875, "completions/mean_terminated_length": 1316.916748046875, "completions/min_length": 551.0, "completions/min_terminated_length": 551.0, "epoch": 0.04308005427408412, "frac_reward_zero_std": 0.0, "grad_norm": 0.490523099899292, "kl": 0.0, "learning_rate": 4.86887508626639e-07, "loss": -0.0003, "num_tokens": 4331583.0, "reward": 1.1708333492279053, "reward_std": 0.2863824963569641, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 127 }, { "completion_length": 956.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1425.0, "completions/max_terminated_length": 1425.0, "completions/mean_length": 956.1666870117188, "completions/mean_terminated_length": 956.1666870117188, "completions/min_length": 601.0, "completions/min_terminated_length": 601.0, "epoch": 0.04341926729986431, "frac_reward_zero_std": 0.5, "grad_norm": 0.34587809443473816, "kl": 0.0, "learning_rate": 4.867149758454106e-07, "loss": -0.0019, "num_tokens": 4355141.0, "reward": 0.8625000715255737, "reward_std": 0.26016825437545776, "rewards/correctness_reward_func/mean": 0.6000000238418579, "rewards/correctness_reward_func/std": 0.36181363463401794, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 128 }, { "completion_length": 831.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1665.0, "completions/max_terminated_length": 1665.0, "completions/mean_length": 831.0833740234375, "completions/mean_terminated_length": 831.0833740234375, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.04375848032564451, "frac_reward_zero_std": 0.5, "grad_norm": 0.08698862046003342, "kl": 0.0, "learning_rate": 4.865424430641822e-07, "loss": -0.0014, "num_tokens": 4378542.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 129 }, { "completion_length": 2297.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5522.0, "completions/max_terminated_length": 5522.0, "completions/mean_length": 2297.75, "completions/mean_terminated_length": 2297.75, "completions/min_length": 1058.0, "completions/min_terminated_length": 1058.0, "epoch": 0.044097693351424695, "frac_reward_zero_std": 0.5, "grad_norm": 0.546366810798645, "kl": 0.0, "learning_rate": 4.863699102829538e-07, "loss": -0.0185, "num_tokens": 4411467.0, "reward": 1.1041667461395264, "reward_std": 0.23474276065826416, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 130 }, { "completion_length": 2667.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4986.0, "completions/max_terminated_length": 4986.0, "completions/mean_length": 2667.33349609375, "completions/mean_terminated_length": 2667.33349609375, "completions/min_length": 1077.0, "completions/min_terminated_length": 1077.0, "epoch": 0.044436906377204884, "frac_reward_zero_std": 0.5, "grad_norm": 0.11569435894489288, "kl": 0.0, "learning_rate": 4.861973775017253e-07, "loss": -0.0001, "num_tokens": 4458631.0, "reward": 1.1500000953674316, "reward_std": 0.05477222427725792, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.09045340120792389, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 131 }, { "completion_length": 1830.0000610351562, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3971.0, "completions/mean_length": 2379.08349609375, "completions/mean_terminated_length": 1996.3636474609375, "completions/min_length": 839.0, "completions/min_terminated_length": 839.0, "epoch": 0.04477611940298507, "frac_reward_zero_std": 0.0, "grad_norm": 0.2605733573436737, "kl": NaN, "learning_rate": 4.860248447204969e-07, "loss": -0.0351, "num_tokens": 4493173.0, "reward": 1.0250000953674316, "reward_std": 0.26536139845848083, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.24308621883392334, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 132 }, { "completion_length": 2761.5001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5889.0, "completions/mean_length": 3310.58349609375, "completions/mean_terminated_length": 3012.545654296875, "completions/min_length": 1545.0, "completions/min_terminated_length": 1545.0, "epoch": 0.04511533242876527, "frac_reward_zero_std": 0.0, "grad_norm": 0.6202500462532043, "kl": NaN, "learning_rate": 4.858523119392685e-07, "loss": -0.0471, "num_tokens": 4536811.0, "reward": 0.6458333730697632, "reward_std": 0.28442299365997314, "rewards/correctness_reward_func/mean": 0.38333332538604736, "rewards/correctness_reward_func/std": 0.4783177673816681, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 133 }, { "completion_length": 1869.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3734.0, "completions/max_terminated_length": 3734.0, "completions/mean_length": 1869.666748046875, "completions/mean_terminated_length": 1869.666748046875, "completions/min_length": 1113.0, "completions/min_terminated_length": 1113.0, "epoch": 0.045454545454545456, "frac_reward_zero_std": 1.0, "grad_norm": 1.1787030729237813e-07, "kl": 0.0, "learning_rate": 4.8567977915804e-07, "loss": 0.0, "num_tokens": 4570359.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 134 }, { "completion_length": 1182.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2124.0, "completions/max_terminated_length": 2124.0, "completions/mean_length": 1182.666748046875, "completions/mean_terminated_length": 1182.666748046875, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 0.045793758480325644, "frac_reward_zero_std": 0.5, "grad_norm": 0.0750177651643753, "kl": 0.0, "learning_rate": 4.855072463768116e-07, "loss": 0.001, "num_tokens": 4593809.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 135 }, { "completion_length": 1200.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5137.0, "completions/max_terminated_length": 5137.0, "completions/mean_length": 1200.5, "completions/mean_terminated_length": 1200.5, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.04613297150610583, "frac_reward_zero_std": 0.5, "grad_norm": 0.5664093494415283, "kl": 0.0, "learning_rate": 4.853347135955831e-07, "loss": 0.0321, "num_tokens": 4620119.0, "reward": 1.1666667461395264, "reward_std": 0.18618986010551453, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.287096232175827, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 136 }, { "completion_length": 2200.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3977.0, "completions/max_terminated_length": 3977.0, "completions/mean_length": 2200.08349609375, "completions/mean_terminated_length": 2200.08349609375, "completions/min_length": 997.0, "completions/min_terminated_length": 997.0, "epoch": 0.04647218453188602, "frac_reward_zero_std": 0.5, "grad_norm": 0.42219477891921997, "kl": 0.0, "learning_rate": 4.851621808143547e-07, "loss": -0.0225, "num_tokens": 4662480.0, "reward": 1.0375001430511475, "reward_std": 0.20600365102291107, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.24308621883392334, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 137 }, { "completion_length": 1934.75, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5036.0, "completions/mean_length": 4680.1669921875, "completions/mean_terminated_length": 3316.71435546875, "completions/min_length": 2530.0, "completions/min_terminated_length": 2530.0, "epoch": 0.046811397557666216, "frac_reward_zero_std": 0.5, "grad_norm": 0.17448575794696808, "kl": NaN, "learning_rate": 4.849896480331262e-07, "loss": -0.0143, "num_tokens": 4698939.0, "reward": 0.5875000953674316, "reward_std": 0.06274950504302979, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.14479610323905945, "step": 138 }, { "completion_length": 1352.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2818.0, "completions/max_terminated_length": 2818.0, "completions/mean_length": 1352.3333740234375, "completions/mean_terminated_length": 1352.3333740234375, "completions/min_length": 668.0, "completions/min_terminated_length": 668.0, "epoch": 0.047150610583446405, "frac_reward_zero_std": 0.5, "grad_norm": 0.29719895124435425, "kl": 0.0, "learning_rate": 4.848171152518978e-07, "loss": -0.0035, "num_tokens": 4724791.0, "reward": 1.1375000476837158, "reward_std": 0.20600365102291107, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444522619247437, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 139 }, { "completion_length": 2913.33349609375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4754.0, "completions/mean_length": 3462.416748046875, "completions/mean_terminated_length": 3178.181884765625, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "epoch": 0.04748982360922659, "frac_reward_zero_std": 0.5, "grad_norm": 0.5751773118972778, "kl": NaN, "learning_rate": 4.846445824706694e-07, "loss": -0.017, "num_tokens": 4769939.0, "reward": 0.4208333492279053, "reward_std": 0.21818380057811737, "rewards/correctness_reward_func/mean": 0.13333334028720856, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 140 }, { "completion_length": 1474.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2125.0, "completions/max_terminated_length": 2125.0, "completions/mean_length": 1474.916748046875, "completions/mean_terminated_length": 1474.916748046875, "completions/min_length": 960.0, "completions/min_terminated_length": 960.0, "epoch": 0.04782903663500678, "frac_reward_zero_std": 0.5, "grad_norm": 0.07152793556451797, "kl": 0.0, "learning_rate": 4.84472049689441e-07, "loss": 0.0009, "num_tokens": 4803178.0, "reward": 1.1875, "reward_std": 0.03061862848699093, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 141 }, { "completion_length": 1793.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2506.0, "completions/max_terminated_length": 2506.0, "completions/mean_length": 1793.916748046875, "completions/mean_terminated_length": 1793.916748046875, "completions/min_length": 1249.0, "completions/min_terminated_length": 1249.0, "epoch": 0.04816824966078698, "frac_reward_zero_std": 0.0, "grad_norm": 0.1186007633805275, "kl": 0.0, "learning_rate": 4.842995169082126e-07, "loss": -0.0018, "num_tokens": 4837575.0, "reward": 1.1041667461395264, "reward_std": 0.07144343852996826, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 142 }, { "completion_length": 1622.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4223.0, "completions/max_terminated_length": 4223.0, "completions/mean_length": 1622.25, "completions/mean_terminated_length": 1622.25, "completions/min_length": 823.0, "completions/min_terminated_length": 823.0, "epoch": 0.048507462686567165, "frac_reward_zero_std": 0.5, "grad_norm": 0.09311472624540329, "kl": 0.0, "learning_rate": 4.841269841269841e-07, "loss": -0.001, "num_tokens": 4868670.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 143 }, { "completion_length": 1624.75, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4910.0, "completions/mean_length": 2722.916748046875, "completions/mean_terminated_length": 1949.7000732421875, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "epoch": 0.048846675712347354, "frac_reward_zero_std": 0.0, "grad_norm": 0.25399917364120483, "kl": NaN, "learning_rate": 4.839544513457557e-07, "loss": -0.0161, "num_tokens": 4895865.0, "reward": 0.6666667461395264, "reward_std": 0.11828449368476868, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 144 }, { "completion_length": 1415.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3606.0, "completions/max_terminated_length": 3606.0, "completions/mean_length": 1415.75, "completions/mean_terminated_length": 1415.75, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.04918588873812754, "frac_reward_zero_std": 0.5, "grad_norm": 0.07228909432888031, "kl": 0.0, "learning_rate": 4.837819185645272e-07, "loss": -0.0003, "num_tokens": 4924656.0, "reward": 1.120833396911621, "reward_std": 0.04005204886198044, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 145 }, { "completion_length": 549.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 549.4166870117188, "completions/mean_terminated_length": 549.4166870117188, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.04952510176390773, "frac_reward_zero_std": 0.0, "grad_norm": 0.055849168449640274, "kl": 0.0, "learning_rate": 4.836093857832988e-07, "loss": 0.0002, "num_tokens": 4948643.0, "reward": 1.1541666984558105, "reward_std": 0.06024051457643509, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.07723929733037949, "step": 146 }, { "completion_length": 1790.916748046875, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6085.0, "completions/mean_length": 4536.33349609375, "completions/mean_terminated_length": 3070.14306640625, "completions/min_length": 686.0, "completions/min_terminated_length": 686.0, "epoch": 0.049864314789687926, "frac_reward_zero_std": 0.5, "grad_norm": 1.2127513885498047, "kl": NaN, "learning_rate": 4.834368530020704e-07, "loss": -0.0705, "num_tokens": 4982428.0, "reward": 0.6416667699813843, "reward_std": 0.22453658282756805, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.17499999701976776, "rewards/format_reward_func/std": 0.15447859466075897, "step": 147 }, { "completion_length": 2748.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3929.0, "completions/max_terminated_length": 3929.0, "completions/mean_length": 2748.416748046875, "completions/mean_terminated_length": 2748.416748046875, "completions/min_length": 971.0, "completions/min_terminated_length": 971.0, "epoch": 0.050203527815468114, "frac_reward_zero_std": 0.5, "grad_norm": 0.5216585397720337, "kl": 0.0, "learning_rate": 4.83264320220842e-07, "loss": -0.0151, "num_tokens": 5029227.0, "reward": 0.833333432674408, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 148 }, { "completion_length": 1443.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2598.0, "completions/max_terminated_length": 2598.0, "completions/mean_length": 1443.416748046875, "completions/mean_terminated_length": 1443.416748046875, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 0.0505427408412483, "frac_reward_zero_std": 0.0, "grad_norm": 0.31681346893310547, "kl": 0.0, "learning_rate": 4.830917874396135e-07, "loss": -0.0102, "num_tokens": 5056202.0, "reward": 0.6458333730697632, "reward_std": 0.2625694274902344, "rewards/correctness_reward_func/mean": 0.38333332538604736, "rewards/correctness_reward_func/std": 0.4783177673816681, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 149 }, { "completion_length": 2423.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 5794.0, "completions/max_terminated_length": 5794.0, "completions/mean_length": 2423.166748046875, "completions/mean_terminated_length": 2423.166748046875, "completions/min_length": 541.0, "completions/min_terminated_length": 541.0, "epoch": 0.05088195386702849, "frac_reward_zero_std": 0.5, "grad_norm": 0.12223686277866364, "kl": 0.0, "learning_rate": 4.829192546583851e-07, "loss": -0.0021, "num_tokens": 5094136.0, "reward": 1.2333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 150 }, { "completion_length": 2325.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 5329.0, "completions/max_terminated_length": 5329.0, "completions/mean_length": 2325.666748046875, "completions/mean_terminated_length": 2325.666748046875, "completions/min_length": 751.0, "completions/min_terminated_length": 751.0, "epoch": 0.05122116689280869, "frac_reward_zero_std": 0.0, "grad_norm": 0.9536461234092712, "kl": 0.0, "learning_rate": 4.827467218771566e-07, "loss": -0.0068, "num_tokens": 5137452.0, "reward": 0.7416666746139526, "reward_std": 0.4643779993057251, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 151 }, { "completion_length": 885.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1787.0, "completions/max_terminated_length": 1787.0, "completions/mean_length": 885.8333740234375, "completions/mean_terminated_length": 885.8333740234375, "completions/min_length": 605.0, "completions/min_terminated_length": 605.0, "epoch": 0.051560379918588875, "frac_reward_zero_std": 0.0, "grad_norm": 0.8494682908058167, "kl": 0.0, "learning_rate": 4.825741890959282e-07, "loss": 0.0056, "num_tokens": 5155720.0, "reward": 0.9250000715255737, "reward_std": 0.2563120126724243, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.3357488214969635, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.07833494991064072, "step": 152 }, { "completion_length": 1869.3334350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 4111.0, "completions/max_terminated_length": 4111.0, "completions/mean_length": 1869.3333740234375, "completions/mean_terminated_length": 1869.3333740234375, "completions/min_length": 547.0, "completions/min_terminated_length": 547.0, "epoch": 0.05189959294436906, "frac_reward_zero_std": 0.0, "grad_norm": 0.1964230239391327, "kl": 0.0, "learning_rate": 4.824016563146997e-07, "loss": 0.0004, "num_tokens": 5188076.0, "reward": 1.2000000476837158, "reward_std": 0.08164963126182556, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 153 }, { "completion_length": 1415.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2859.0, "completions/max_terminated_length": 2859.0, "completions/mean_length": 1415.666748046875, "completions/mean_terminated_length": 1415.666748046875, "completions/min_length": 632.0, "completions/min_terminated_length": 632.0, "epoch": 0.05223880597014925, "frac_reward_zero_std": 0.0, "grad_norm": 0.35508859157562256, "kl": 0.0, "learning_rate": 4.822291235334713e-07, "loss": 0.0054, "num_tokens": 5217724.0, "reward": 0.7416666746139526, "reward_std": 0.20202915370464325, "rewards/correctness_reward_func/mean": 0.46666669845581055, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 154 }, { "completion_length": 2550.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5538.0, "completions/max_terminated_length": 5538.0, "completions/mean_length": 2550.166748046875, "completions/mean_terminated_length": 2550.166748046875, "completions/min_length": 971.0, "completions/min_terminated_length": 971.0, "epoch": 0.05257801899592944, "frac_reward_zero_std": 0.5, "grad_norm": 0.22098730504512787, "kl": 0.0, "learning_rate": 4.82056590752243e-07, "loss": 0.0016, "num_tokens": 5257764.0, "reward": 1.120833396911621, "reward_std": 0.06785397976636887, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 155 }, { "completion_length": 693.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1217.0, "completions/max_terminated_length": 1217.0, "completions/mean_length": 693.25, "completions/mean_terminated_length": 693.25, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.052917232021709636, "frac_reward_zero_std": 0.5, "grad_norm": 0.05356336012482643, "kl": 0.0, "learning_rate": 4.818840579710144e-07, "loss": 0.0001, "num_tokens": 5279343.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 156 }, { "completion_length": 840.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1473.0, "completions/max_terminated_length": 1473.0, "completions/mean_length": 840.4166870117188, "completions/mean_terminated_length": 840.4166870117188, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 0.053256445047489824, "frac_reward_zero_std": 0.5, "grad_norm": 0.38578730821609497, "kl": 0.0, "learning_rate": 4.817115251897861e-07, "loss": -0.0039, "num_tokens": 5303306.0, "reward": 0.5416666269302368, "reward_std": 0.24528895318508148, "rewards/correctness_reward_func/mean": 0.2666666805744171, "rewards/correctness_reward_func/std": 0.393892765045166, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 157 }, { "completion_length": 2539.666748046875, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6584.0, "completions/mean_length": 3637.83349609375, "completions/mean_terminated_length": 3047.60009765625, "completions/min_length": 1685.0, "completions/min_terminated_length": 1685.0, "epoch": 0.05359565807327001, "frac_reward_zero_std": 0.0, "grad_norm": 0.7518226504325867, "kl": NaN, "learning_rate": 4.815389924085576e-07, "loss": -0.0562, "num_tokens": 5345350.0, "reward": 0.8791667819023132, "reward_std": 0.4417826533317566, "rewards/correctness_reward_func/mean": 0.6166666746139526, "rewards/correctness_reward_func/std": 0.37618499994277954, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 158 }, { "completion_length": 1451.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4533.0, "completions/max_terminated_length": 4533.0, "completions/mean_length": 1451.666748046875, "completions/mean_terminated_length": 1451.666748046875, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.0539348710990502, "frac_reward_zero_std": 0.5, "grad_norm": 0.09454381465911865, "kl": 0.0, "learning_rate": 4.813664596273292e-07, "loss": -0.0008, "num_tokens": 5374308.0, "reward": 1.1041667461395264, "reward_std": 0.05571504682302475, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 159 }, { "completion_length": 3554.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5146.0, "completions/max_terminated_length": 5146.0, "completions/mean_length": 3554.08349609375, "completions/mean_terminated_length": 3554.08349609375, "completions/min_length": 1727.0, "completions/min_terminated_length": 1727.0, "epoch": 0.054274084124830396, "frac_reward_zero_std": 1.0, "grad_norm": 5.624274308502208e-07, "kl": 0.0, "learning_rate": 4.811939268461007e-07, "loss": 0.0, "num_tokens": 5428465.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 160 }, { "completion_length": 1793.2500610351562, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5850.0, "completions/mean_length": 2891.416748046875, "completions/mean_terminated_length": 2151.900146484375, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 0.054613297150610585, "frac_reward_zero_std": 0.0, "grad_norm": 0.6140305995941162, "kl": NaN, "learning_rate": 4.810213940648723e-07, "loss": -0.102, "num_tokens": 5464558.0, "reward": 1.0166666507720947, "reward_std": 0.3798363208770752, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941801071167, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 161 }, { "completion_length": 2627.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6292.0, "completions/max_terminated_length": 6292.0, "completions/mean_length": 2627.666748046875, "completions/mean_terminated_length": 2627.666748046875, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.05495251017639077, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.808488612836439e-07, "loss": 0.0, "num_tokens": 5509680.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 162 }, { "completion_length": 2503.8333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4827.0, "completions/mean_length": 3052.916748046875, "completions/mean_terminated_length": 2731.45458984375, "completions/min_length": 867.0, "completions/min_terminated_length": 867.0, "epoch": 0.05529172320217096, "frac_reward_zero_std": 0.0, "grad_norm": 0.7075985670089722, "kl": NaN, "learning_rate": 4.806763285024155e-07, "loss": 0.0062, "num_tokens": 5549920.0, "reward": 0.8583333492279053, "reward_std": 0.4465666711330414, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 163 }, { "completion_length": 2481.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4697.0, "completions/max_terminated_length": 4697.0, "completions/mean_length": 2481.75, "completions/mean_terminated_length": 2481.75, "completions/min_length": 965.0, "completions/min_terminated_length": 965.0, "epoch": 0.05563093622795115, "frac_reward_zero_std": 0.0, "grad_norm": 0.6668363213539124, "kl": 0.0, "learning_rate": 4.80503795721187e-07, "loss": -0.0086, "num_tokens": 5592253.0, "reward": 1.066666841506958, "reward_std": 0.21493908762931824, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.2534608840942383, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 164 }, { "completion_length": 1200.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2080.0, "completions/max_terminated_length": 2080.0, "completions/mean_length": 1200.416748046875, "completions/mean_terminated_length": 1200.416748046875, "completions/min_length": 654.0, "completions/min_terminated_length": 654.0, "epoch": 0.055970149253731345, "frac_reward_zero_std": 1.0, "grad_norm": 1.0779814374473062e-07, "kl": 0.0, "learning_rate": 4.803312629399586e-07, "loss": 0.0, "num_tokens": 5621964.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 165 }, { "completion_length": 1244.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3143.0, "completions/max_terminated_length": 3143.0, "completions/mean_length": 1244.5, "completions/mean_terminated_length": 1244.5, "completions/min_length": 583.0, "completions/min_terminated_length": 583.0, "epoch": 0.056309362279511534, "frac_reward_zero_std": 0.5, "grad_norm": 0.4018653631210327, "kl": 0.0, "learning_rate": 4.801587301587301e-07, "loss": 0.0058, "num_tokens": 5650680.0, "reward": 0.833333432674408, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 166 }, { "completion_length": 620.6666717529297, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 620.6666870117188, "completions/mean_terminated_length": 620.6666870117188, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.05664857530529172, "frac_reward_zero_std": 0.5, "grad_norm": 0.042549144476652145, "kl": 0.0, "learning_rate": 4.799861973775017e-07, "loss": -0.0001, "num_tokens": 5670530.0, "reward": 0.7875000834465027, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 167 }, { "completion_length": 3173.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6019.0, "completions/max_terminated_length": 6019.0, "completions/mean_length": 3173.75, "completions/mean_terminated_length": 3173.75, "completions/min_length": 1452.0, "completions/min_terminated_length": 1452.0, "epoch": 0.05698778833107191, "frac_reward_zero_std": 0.0, "grad_norm": 0.9396628141403198, "kl": 0.0, "learning_rate": 4.798136645962732e-07, "loss": 0.0374, "num_tokens": 5720045.0, "reward": 0.9541667699813843, "reward_std": 0.42716550827026367, "rewards/correctness_reward_func/mean": 0.6666666269302368, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 168 }, { "completion_length": 1277.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1644.0, "completions/max_terminated_length": 1644.0, "completions/mean_length": 1277.8333740234375, "completions/mean_terminated_length": 1277.8333740234375, "completions/min_length": 650.0, "completions/min_terminated_length": 650.0, "epoch": 0.057327001356852106, "frac_reward_zero_std": 1.0, "grad_norm": 2.870347088901326e-07, "kl": 0.0, "learning_rate": 4.796411318150448e-07, "loss": 0.0, "num_tokens": 5749149.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 169 }, { "completion_length": 1964.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3663.0, "completions/max_terminated_length": 3663.0, "completions/mean_length": 1964.0, "completions/mean_terminated_length": 1964.0, "completions/min_length": 859.0, "completions/min_terminated_length": 859.0, "epoch": 0.057666214382632294, "frac_reward_zero_std": 0.5, "grad_norm": 0.475134015083313, "kl": 0.0, "learning_rate": 4.794685990338165e-07, "loss": -0.0215, "num_tokens": 5786289.0, "reward": 0.7833334803581238, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 170 }, { "completion_length": 1437.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1939.0, "completions/max_terminated_length": 1939.0, "completions/mean_length": 1437.3333740234375, "completions/mean_terminated_length": 1437.3333740234375, "completions/min_length": 742.0, "completions/min_terminated_length": 742.0, "epoch": 0.05800542740841248, "frac_reward_zero_std": 0.0, "grad_norm": 0.4975443482398987, "kl": 0.0, "learning_rate": 4.792960662525879e-07, "loss": 0.0067, "num_tokens": 5814013.0, "reward": 0.9458333849906921, "reward_std": 0.39763349294662476, "rewards/correctness_reward_func/mean": 0.6833333969116211, "rewards/correctness_reward_func/std": 0.32427075505256653, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 171 }, { "completion_length": 3024.5833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6300.0, "completions/mean_length": 3573.666748046875, "completions/mean_terminated_length": 3299.545654296875, "completions/min_length": 1574.0, "completions/min_terminated_length": 1574.0, "epoch": 0.05834464043419267, "frac_reward_zero_std": 0.0, "grad_norm": 0.2008449286222458, "kl": NaN, "learning_rate": 4.791235334713596e-07, "loss": -0.011, "num_tokens": 5861486.0, "reward": 0.7375000715255737, "reward_std": 0.0853908509016037, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 172 }, { "completion_length": 2863.7501220703125, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5883.0, "completions/mean_length": 4511.0, "completions/mean_terminated_length": 3818.333251953125, "completions/min_length": 2050.0, "completions/min_terminated_length": 2050.0, "epoch": 0.05868385345997286, "frac_reward_zero_std": 0.0, "grad_norm": 0.30197957158088684, "kl": NaN, "learning_rate": 4.789510006901311e-07, "loss": -0.0339, "num_tokens": 5903651.0, "reward": 0.22500000894069672, "reward_std": 0.13869690895080566, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 173 }, { "completion_length": 1077.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3142.0, "completions/max_terminated_length": 3142.0, "completions/mean_length": 1077.8333740234375, "completions/mean_terminated_length": 1077.8333740234375, "completions/min_length": 555.0, "completions/min_terminated_length": 555.0, "epoch": 0.059023066485753055, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.787784679089027e-07, "loss": 0.0, "num_tokens": 5927547.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 174 }, { "completion_length": 2095.3334350585938, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6126.0, "completions/mean_length": 4840.75, "completions/mean_terminated_length": 3592.000244140625, "completions/min_length": 1967.0, "completions/min_terminated_length": 1967.0, "epoch": 0.05936227951153324, "frac_reward_zero_std": 0.5, "grad_norm": 0.6622049808502197, "kl": NaN, "learning_rate": 4.786059351276742e-07, "loss": -0.071, "num_tokens": 5961595.0, "reward": 0.24166667461395264, "reward_std": 0.22453658282756805, "rewards/correctness_reward_func/mean": 0.06666667014360428, "rewards/correctness_reward_func/std": 0.2309401035308838, "rewards/format_reward_func/mean": 0.17500001192092896, "rewards/format_reward_func/std": 0.15447859466075897, "step": 175 }, { "completion_length": 2191.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3487.0, "completions/max_terminated_length": 3487.0, "completions/mean_length": 2191.75, "completions/mean_terminated_length": 2191.75, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "epoch": 0.05970149253731343, "frac_reward_zero_std": 0.5, "grad_norm": 0.6641126275062561, "kl": 0.0, "learning_rate": 4.784334023464458e-07, "loss": -0.0095, "num_tokens": 5995228.0, "reward": 0.9000000953674316, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.6000000238418579, "rewards/correctness_reward_func/std": 0.36181360483169556, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 176 }, { "completion_length": 2058.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5513.0, "completions/max_terminated_length": 5513.0, "completions/mean_length": 2058.166748046875, "completions/mean_terminated_length": 2058.166748046875, "completions/min_length": 854.0, "completions/min_terminated_length": 854.0, "epoch": 0.06004070556309362, "frac_reward_zero_std": 1.0, "grad_norm": 2.813504522691801e-07, "kl": 0.0, "learning_rate": 4.782608695652174e-07, "loss": 0.0, "num_tokens": 6032094.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 177 }, { "completion_length": 2605.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5421.0, "completions/max_terminated_length": 5421.0, "completions/mean_length": 2605.0, "completions/mean_terminated_length": 2605.0, "completions/min_length": 1045.0, "completions/min_terminated_length": 1045.0, "epoch": 0.060379918588873815, "frac_reward_zero_std": 0.0, "grad_norm": 0.9859559535980225, "kl": 0.0, "learning_rate": 4.780883367839889e-07, "loss": -0.0052, "num_tokens": 6076770.0, "reward": 1.0333333015441895, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.7333332896232605, "rewards/correctness_reward_func/std": 0.3550501763820648, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 178 }, { "completion_length": 2074.8334350585938, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4515.0, "completions/mean_length": 2623.916748046875, "completions/mean_terminated_length": 2263.45458984375, "completions/min_length": 1183.0, "completions/min_terminated_length": 1183.0, "epoch": 0.060719131614654004, "frac_reward_zero_std": 0.0, "grad_norm": 0.6357032656669617, "kl": NaN, "learning_rate": 4.779158040027605e-07, "loss": -0.0209, "num_tokens": 6112522.0, "reward": 1.0916666984558105, "reward_std": 0.26536136865615845, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 179 }, { "completion_length": 1129.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2240.0, "completions/max_terminated_length": 2240.0, "completions/mean_length": 1129.0833740234375, "completions/mean_terminated_length": 1129.0833740234375, "completions/min_length": 559.0, "completions/min_terminated_length": 559.0, "epoch": 0.06105834464043419, "frac_reward_zero_std": 0.0, "grad_norm": 0.09178594499826431, "kl": 0.0, "learning_rate": 4.777432712215321e-07, "loss": -0.001, "num_tokens": 6134249.0, "reward": 1.25, "reward_std": 0.09246455878019333, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 180 }, { "completion_length": 2996.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5630.0, "completions/max_terminated_length": 5630.0, "completions/mean_length": 2996.5, "completions/mean_terminated_length": 2996.5, "completions/min_length": 794.0, "completions/min_terminated_length": 794.0, "epoch": 0.06139755766621438, "frac_reward_zero_std": 0.5, "grad_norm": 0.6492721438407898, "kl": 0.0, "learning_rate": 4.775707384403036e-07, "loss": 0.0108, "num_tokens": 6184307.0, "reward": 0.9666668176651001, "reward_std": 0.20655910670757294, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 181 }, { "completion_length": 2291.25, "completions/clipped_ratio": 0.0, "completions/max_length": 4261.0, "completions/max_terminated_length": 4261.0, "completions/mean_length": 2291.25, "completions/mean_terminated_length": 2291.25, "completions/min_length": 632.0, "completions/min_terminated_length": 632.0, "epoch": 0.06173677069199457, "frac_reward_zero_std": 0.5, "grad_norm": 0.625225841999054, "kl": 0.0, "learning_rate": 4.773982056590752e-07, "loss": 0.0127, "num_tokens": 6219686.0, "reward": 0.6333333253860474, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 182 }, { "completion_length": 1227.0833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 2438.0, "completions/mean_length": 1776.166748046875, "completions/mean_terminated_length": 1338.6363525390625, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 0.062075983717774764, "frac_reward_zero_std": 0.0, "grad_norm": 0.7521762847900391, "kl": NaN, "learning_rate": 4.772256728778468e-07, "loss": -0.0184, "num_tokens": 6244383.0, "reward": 0.9291666746139526, "reward_std": 0.41845452785491943, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 183 }, { "completion_length": 2018.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 6042.0, "completions/max_terminated_length": 6042.0, "completions/mean_length": 2018.0833740234375, "completions/mean_terminated_length": 2018.0833740234375, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 0.06241519674355495, "frac_reward_zero_std": 0.5, "grad_norm": 0.6339729428291321, "kl": 0.0, "learning_rate": 4.770531400966183e-07, "loss": -0.0363, "num_tokens": 6279580.0, "reward": 0.7583333849906921, "reward_std": 0.21946904063224792, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 184 }, { "completion_length": 2131.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4085.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 2131.0, "completions/mean_terminated_length": 2131.0, "completions/min_length": 1032.0, "completions/min_terminated_length": 1032.0, "epoch": 0.06275440976933515, "frac_reward_zero_std": 1.0, "grad_norm": 3.2719327691665967e-07, "kl": 0.0, "learning_rate": 4.7688060731539e-07, "loss": 0.0, "num_tokens": 6317164.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 185 }, { "completion_length": 1139.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2115.0, "completions/max_terminated_length": 2115.0, "completions/mean_length": 1139.8333740234375, "completions/mean_terminated_length": 1139.8333740234375, "completions/min_length": 575.0, "completions/min_terminated_length": 575.0, "epoch": 0.06309362279511534, "frac_reward_zero_std": 1.0, "grad_norm": 1.9935814066229796e-07, "kl": 0.0, "learning_rate": 4.7670807453416146e-07, "loss": 0.0, "num_tokens": 6345572.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 186 }, { "completion_length": 2896.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4331.0, "completions/max_terminated_length": 4331.0, "completions/mean_length": 2896.08349609375, "completions/mean_terminated_length": 2896.08349609375, "completions/min_length": 1513.0, "completions/min_terminated_length": 1513.0, "epoch": 0.06343283582089553, "frac_reward_zero_std": 1.0, "grad_norm": 3.4238826174259884e-07, "kl": 0.0, "learning_rate": 4.76535541752933e-07, "loss": 0.0, "num_tokens": 6390285.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 187 }, { "completion_length": 1860.25, "completions/clipped_ratio": 0.0, "completions/max_length": 4907.0, "completions/max_terminated_length": 4907.0, "completions/mean_length": 1860.25, "completions/mean_terminated_length": 1860.25, "completions/min_length": 922.0, "completions/min_terminated_length": 922.0, "epoch": 0.06377204884667571, "frac_reward_zero_std": 0.5, "grad_norm": 1.6667431592941284, "kl": 0.0, "learning_rate": 4.763630089717046e-07, "loss": 0.0349, "num_tokens": 6425994.0, "reward": 1.0833334922790527, "reward_std": 0.19407902657985687, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 188 }, { "completion_length": 1321.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3460.0, "completions/max_terminated_length": 3460.0, "completions/mean_length": 1321.666748046875, "completions/mean_terminated_length": 1321.666748046875, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.0641112618724559, "frac_reward_zero_std": 0.5, "grad_norm": 0.37701964378356934, "kl": 0.0, "learning_rate": 4.761904761904761e-07, "loss": 0.0042, "num_tokens": 6448568.0, "reward": 0.7166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 189 }, { "completion_length": 2525.33349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 4880.0, "completions/max_terminated_length": 4880.0, "completions/mean_length": 2525.33349609375, "completions/mean_terminated_length": 2525.33349609375, "completions/min_length": 879.0, "completions/min_terminated_length": 879.0, "epoch": 0.06445047489823609, "frac_reward_zero_std": 0.0, "grad_norm": 1.206994891166687, "kl": 0.0, "learning_rate": 4.7601794340924773e-07, "loss": 0.0475, "num_tokens": 6491160.0, "reward": 0.8333333730697632, "reward_std": 0.4772879481315613, "rewards/correctness_reward_func/mean": 0.5333333015441895, "rewards/correctness_reward_func/std": 0.47736650705337524, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 190 }, { "completion_length": 994.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2085.0, "completions/max_terminated_length": 2085.0, "completions/mean_length": 994.5833740234375, "completions/mean_terminated_length": 994.5833740234375, "completions/min_length": 526.0, "completions/min_terminated_length": 526.0, "epoch": 0.06478968792401628, "frac_reward_zero_std": 1.0, "grad_norm": 2.1302626862507168e-07, "kl": 0.0, "learning_rate": 4.758454106280193e-07, "loss": 0.0, "num_tokens": 6513985.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 191 }, { "completion_length": 1926.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3138.0, "completions/max_terminated_length": 3138.0, "completions/mean_length": 1926.0, "completions/mean_terminated_length": 1926.0, "completions/min_length": 926.0, "completions/min_terminated_length": 926.0, "epoch": 0.06512890094979647, "frac_reward_zero_std": 1.0, "grad_norm": 2.765222859579808e-07, "kl": 0.0, "learning_rate": 4.756728778467909e-07, "loss": 0.0, "num_tokens": 6548065.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 192 }, { "completion_length": 2050.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3179.0, "completions/max_terminated_length": 3179.0, "completions/mean_length": 2050.666748046875, "completions/mean_terminated_length": 2050.666748046875, "completions/min_length": 708.0, "completions/min_terminated_length": 708.0, "epoch": 0.06546811397557666, "frac_reward_zero_std": 1.0, "grad_norm": 1.0529081606591717e-07, "kl": 0.0, "learning_rate": 4.7550034506556244e-07, "loss": 0.0, "num_tokens": 6586623.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 193 }, { "completion_length": 2184.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3969.0, "completions/max_terminated_length": 3969.0, "completions/mean_length": 2184.58349609375, "completions/mean_terminated_length": 2184.58349609375, "completions/min_length": 853.0, "completions/min_terminated_length": 853.0, "epoch": 0.06580732700135686, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.75327812284334e-07, "loss": 0.0, "num_tokens": 6622516.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 194 }, { "completion_length": 1352.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2634.0, "completions/max_terminated_length": 2634.0, "completions/mean_length": 1352.666748046875, "completions/mean_terminated_length": 1352.666748046875, "completions/min_length": 597.0, "completions/min_terminated_length": 597.0, "epoch": 0.06614654002713705, "frac_reward_zero_std": 0.5, "grad_norm": 0.10883874446153641, "kl": 0.0, "learning_rate": 4.751552795031056e-07, "loss": -0.0022, "num_tokens": 6653136.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 195 }, { "completion_length": 1927.0000610351562, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4352.0, "completions/mean_length": 3025.166748046875, "completions/mean_terminated_length": 2312.400146484375, "completions/min_length": 1171.0, "completions/min_terminated_length": 1171.0, "epoch": 0.06648575305291723, "frac_reward_zero_std": 0.5, "grad_norm": 0.4869142770767212, "kl": NaN, "learning_rate": 4.7498274672187715e-07, "loss": -0.0407, "num_tokens": 6689124.0, "reward": 0.8500000834465027, "reward_std": 0.279284805059433, "rewards/correctness_reward_func/mean": 0.6000000238418579, "rewards/correctness_reward_func/std": 0.36181360483169556, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 196 }, { "completion_length": 1327.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2700.0, "completions/max_terminated_length": 2700.0, "completions/mean_length": 1327.0, "completions/mean_terminated_length": 1327.0, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 0.06682496607869742, "frac_reward_zero_std": 1.0, "grad_norm": 9.299483849645185e-08, "kl": 0.0, "learning_rate": 4.748102139406487e-07, "loss": 0.0, "num_tokens": 6719208.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 197 }, { "completion_length": 1178.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3020.0, "completions/max_terminated_length": 3020.0, "completions/mean_length": 1178.8333740234375, "completions/mean_terminated_length": 1178.8333740234375, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.06716417910447761, "frac_reward_zero_std": 0.5, "grad_norm": 0.08082883059978485, "kl": 0.0, "learning_rate": 4.7463768115942026e-07, "loss": -0.0002, "num_tokens": 6742648.0, "reward": 0.6375000476837158, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.07723929733037949, "step": 198 }, { "completion_length": 2507.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5850.0, "completions/max_terminated_length": 5850.0, "completions/mean_length": 2507.416748046875, "completions/mean_terminated_length": 2507.416748046875, "completions/min_length": 1005.0, "completions/min_terminated_length": 1005.0, "epoch": 0.0675033921302578, "frac_reward_zero_std": 0.5, "grad_norm": 0.6426955461502075, "kl": 0.0, "learning_rate": 4.7446514837819186e-07, "loss": -0.0252, "num_tokens": 6785319.0, "reward": 1.0166666507720947, "reward_std": 0.24013885855674744, "rewards/correctness_reward_func/mean": 0.7166666984558105, "rewards/correctness_reward_func/std": 0.4386618733406067, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 199 }, { "completion_length": 1075.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2798.0, "completions/max_terminated_length": 2798.0, "completions/mean_length": 1075.0833740234375, "completions/mean_terminated_length": 1075.0833740234375, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 0.06784260515603799, "frac_reward_zero_std": 0.5, "grad_norm": 0.05683635175228119, "kl": 0.0, "learning_rate": 4.7429261559696336e-07, "loss": 0.0, "num_tokens": 6809584.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 200 }, { "completion_length": 3697.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 6119.0, "completions/max_terminated_length": 6119.0, "completions/mean_length": 3697.58349609375, "completions/mean_terminated_length": 3697.58349609375, "completions/min_length": 1100.0, "completions/min_terminated_length": 1100.0, "epoch": 0.06818181818181818, "frac_reward_zero_std": 1.0, "grad_norm": 1.3760663364337233e-07, "kl": 0.0, "learning_rate": 4.7412008281573497e-07, "loss": 0.0, "num_tokens": 6863723.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 201 }, { "completion_length": 2568.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4082.0, "completions/max_terminated_length": 4082.0, "completions/mean_length": 2568.916748046875, "completions/mean_terminated_length": 2568.916748046875, "completions/min_length": 1818.0, "completions/min_terminated_length": 1818.0, "epoch": 0.06852103120759837, "frac_reward_zero_std": 0.5, "grad_norm": 0.7128849029541016, "kl": 0.0, "learning_rate": 4.739475500345065e-07, "loss": -0.0046, "num_tokens": 6904072.0, "reward": 1.1041667461395264, "reward_std": 0.23474274575710297, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 202 }, { "completion_length": 2055.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3684.0, "completions/max_terminated_length": 3684.0, "completions/mean_length": 2055.416748046875, "completions/mean_terminated_length": 2055.416748046875, "completions/min_length": 790.0, "completions/min_terminated_length": 790.0, "epoch": 0.06886024423337857, "frac_reward_zero_std": 0.0, "grad_norm": 0.10221155732870102, "kl": 0.0, "learning_rate": 4.737750172532781e-07, "loss": 0.0007, "num_tokens": 6943305.0, "reward": 1.0750000476837158, "reward_std": 0.06123722717165947, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 203 }, { "completion_length": 1224.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3263.0, "completions/max_terminated_length": 3263.0, "completions/mean_length": 1224.25, "completions/mean_terminated_length": 1224.25, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.06919945725915876, "frac_reward_zero_std": 1.0, "grad_norm": 3.1618932894161844e-07, "kl": 0.0, "learning_rate": 4.736024844720496e-07, "loss": 0.0, "num_tokens": 6971046.0, "reward": 1.125, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.07833494991064072, "step": 204 }, { "completion_length": 1673.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3900.0, "completions/max_terminated_length": 3900.0, "completions/mean_length": 1673.8333740234375, "completions/mean_terminated_length": 1673.8333740234375, "completions/min_length": 938.0, "completions/min_terminated_length": 938.0, "epoch": 0.06953867028493894, "frac_reward_zero_std": 1.0, "grad_norm": 3.1687034152128035e-07, "kl": 0.0, "learning_rate": 4.7342995169082123e-07, "loss": 0.0, "num_tokens": 7004194.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 205 }, { "completion_length": 1079.5, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4487.0, "completions/mean_length": 3275.83349609375, "completions/mean_terminated_length": 1619.25, "completions/min_length": 581.0, "completions/min_terminated_length": 581.0, "epoch": 0.06987788331071913, "frac_reward_zero_std": 0.0, "grad_norm": 0.931649923324585, "kl": NaN, "learning_rate": 4.732574189095928e-07, "loss": -0.0345, "num_tokens": 7028488.0, "reward": 0.7666666507720947, "reward_std": 0.24533745646476746, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.133143812417984, "step": 206 }, { "completion_length": 1299.6667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 3137.0, "completions/max_terminated_length": 3137.0, "completions/mean_length": 1299.666748046875, "completions/mean_terminated_length": 1299.666748046875, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.07021709633649932, "frac_reward_zero_std": 0.5, "grad_norm": 0.07907649129629135, "kl": 0.0, "learning_rate": 4.730848861283644e-07, "loss": -0.0023, "num_tokens": 7053222.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 207 }, { "completion_length": 1989.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2931.0, "completions/max_terminated_length": 2931.0, "completions/mean_length": 1989.5, "completions/mean_terminated_length": 1989.5, "completions/min_length": 709.0, "completions/min_terminated_length": 709.0, "epoch": 0.07055630936227951, "frac_reward_zero_std": 0.0, "grad_norm": 2.503600597381592, "kl": 0.0, "learning_rate": 4.7291235334713594e-07, "loss": 0.0031, "num_tokens": 7089882.0, "reward": 0.8500000834465027, "reward_std": 0.2473839521408081, "rewards/correctness_reward_func/mean": 0.550000011920929, "rewards/correctness_reward_func/std": 0.4100997745990753, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 208 }, { "completion_length": 2048.5834350585938, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5063.0, "completions/mean_length": 3146.75, "completions/mean_terminated_length": 2458.300048828125, "completions/min_length": 1278.0, "completions/min_terminated_length": 1278.0, "epoch": 0.0708955223880597, "frac_reward_zero_std": 0.5, "grad_norm": 6.7276811599731445, "kl": NaN, "learning_rate": 4.727398205659075e-07, "loss": -0.0123, "num_tokens": 7128205.0, "reward": 0.6375000476837158, "reward_std": 0.07373940199613571, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.11894422769546509, "step": 209 }, { "completion_length": 2508.166748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6519.0, "completions/mean_length": 4155.4169921875, "completions/mean_terminated_length": 3344.22216796875, "completions/min_length": 1361.0, "completions/min_terminated_length": 1361.0, "epoch": 0.07123473541383989, "frac_reward_zero_std": 0.5, "grad_norm": 0.17133860290050507, "kl": NaN, "learning_rate": 4.725672877846791e-07, "loss": -0.0279, "num_tokens": 7167573.0, "reward": 0.6250001192092896, "reward_std": 0.08215838670730591, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 210 }, { "completion_length": 1452.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3934.0, "completions/max_terminated_length": 3934.0, "completions/mean_length": 1452.0, "completions/mean_terminated_length": 1452.0, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.07157394843962007, "frac_reward_zero_std": 0.0, "grad_norm": 0.1274157166481018, "kl": 0.0, "learning_rate": 4.723947550034506e-07, "loss": -0.0028, "num_tokens": 7198827.0, "reward": 1.2000000476837158, "reward_std": 0.09350206702947617, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.07385490089654922, "step": 211 }, { "completion_length": 1059.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1710.0, "completions/max_terminated_length": 1710.0, "completions/mean_length": 1059.5833740234375, "completions/mean_terminated_length": 1059.5833740234375, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 0.07191316146540028, "frac_reward_zero_std": 0.5, "grad_norm": 0.049741536378860474, "kl": 0.0, "learning_rate": 4.722222222222222e-07, "loss": -0.0011, "num_tokens": 7226206.0, "reward": 1.1875, "reward_std": 0.030618613585829735, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 212 }, { "completion_length": 2186.7501220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4565.0, "completions/mean_length": 2735.83349609375, "completions/mean_terminated_length": 2385.54541015625, "completions/min_length": 1381.0, "completions/min_terminated_length": 1381.0, "epoch": 0.07225237449118047, "frac_reward_zero_std": 0.0, "grad_norm": 0.7999017834663391, "kl": NaN, "learning_rate": 4.7204968944099376e-07, "loss": -0.0145, "num_tokens": 7262761.0, "reward": 0.658333420753479, "reward_std": 0.25531625747680664, "rewards/correctness_reward_func/mean": 0.38333332538604736, "rewards/correctness_reward_func/std": 0.4783177673816681, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 213 }, { "completion_length": 2076.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4194.0, "completions/mean_length": 2625.08349609375, "completions/mean_terminated_length": 2264.727294921875, "completions/min_length": 1039.0, "completions/min_terminated_length": 1039.0, "epoch": 0.07259158751696065, "frac_reward_zero_std": 0.0, "grad_norm": 0.8687799572944641, "kl": NaN, "learning_rate": 4.7187715665976537e-07, "loss": -0.02, "num_tokens": 7298377.0, "reward": 0.625, "reward_std": 0.5025304555892944, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4972652792930603, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.10112998634576797, "step": 214 }, { "completion_length": 1112.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2367.0, "completions/max_terminated_length": 2367.0, "completions/mean_length": 1112.5, "completions/mean_terminated_length": 1112.5, "completions/min_length": 573.0, "completions/min_terminated_length": 573.0, "epoch": 0.07293080054274084, "frac_reward_zero_std": 1.0, "grad_norm": 1.1586325143753129e-07, "kl": 0.0, "learning_rate": 4.7170462387853687e-07, "loss": 0.0, "num_tokens": 7324171.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 215 }, { "completion_length": 1243.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2557.0, "completions/max_terminated_length": 2557.0, "completions/mean_length": 1243.916748046875, "completions/mean_terminated_length": 1243.916748046875, "completions/min_length": 653.0, "completions/min_terminated_length": 653.0, "epoch": 0.07327001356852103, "frac_reward_zero_std": 0.5, "grad_norm": 0.3274558484554291, "kl": 0.0, "learning_rate": 4.7153209109730847e-07, "loss": -0.0002, "num_tokens": 7354998.0, "reward": 1.0208334922790527, "reward_std": 0.19391795992851257, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 216 }, { "completion_length": 1607.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4015.0, "completions/max_terminated_length": 4015.0, "completions/mean_length": 1607.0833740234375, "completions/mean_terminated_length": 1607.0833740234375, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 0.07360922659430122, "frac_reward_zero_std": 0.5, "grad_norm": 0.11406178027391434, "kl": 0.0, "learning_rate": 4.7135955831608e-07, "loss": -0.0028, "num_tokens": 7391251.0, "reward": 0.7333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 217 }, { "completion_length": 774.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1245.0, "completions/max_terminated_length": 1245.0, "completions/mean_length": 774.5833740234375, "completions/mean_terminated_length": 774.5833740234375, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 0.07394843962008141, "frac_reward_zero_std": 0.5, "grad_norm": 0.0900874137878418, "kl": 0.0, "learning_rate": 4.7118702553485163e-07, "loss": 0.0, "num_tokens": 7416254.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 218 }, { "completion_length": 1318.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2595.0, "completions/max_terminated_length": 2595.0, "completions/mean_length": 1318.0, "completions/mean_terminated_length": 1318.0, "completions/min_length": 544.0, "completions/min_terminated_length": 544.0, "epoch": 0.0742876526458616, "frac_reward_zero_std": 0.5, "grad_norm": 0.0830475240945816, "kl": 0.0, "learning_rate": 4.7101449275362313e-07, "loss": -0.0019, "num_tokens": 7442162.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 219 }, { "completion_length": 1440.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3651.0, "completions/max_terminated_length": 3651.0, "completions/mean_length": 1440.75, "completions/mean_terminated_length": 1440.75, "completions/min_length": 604.0, "completions/min_terminated_length": 604.0, "epoch": 0.07462686567164178, "frac_reward_zero_std": 0.5, "grad_norm": 0.1543527990579605, "kl": 0.0, "learning_rate": 4.7084195997239474e-07, "loss": -0.0023, "num_tokens": 7466825.0, "reward": 1.2333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 220 }, { "completion_length": 1963.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3612.0, "completions/max_terminated_length": 3612.0, "completions/mean_length": 1963.0, "completions/mean_terminated_length": 1963.0, "completions/min_length": 696.0, "completions/min_terminated_length": 696.0, "epoch": 0.07496607869742199, "frac_reward_zero_std": 0.5, "grad_norm": 0.09828822314739227, "kl": 0.0, "learning_rate": 4.706694271911663e-07, "loss": 0.002, "num_tokens": 7500995.0, "reward": 1.120833396911621, "reward_std": 0.04005204886198044, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 221 }, { "completion_length": 1688.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4377.0, "completions/max_terminated_length": 4377.0, "completions/mean_length": 1688.8333740234375, "completions/mean_terminated_length": 1688.8333740234375, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.07530529172320218, "frac_reward_zero_std": 0.5, "grad_norm": 0.280002236366272, "kl": 0.0, "learning_rate": 4.704968944099379e-07, "loss": 0.0004, "num_tokens": 7535055.0, "reward": 0.9333333373069763, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.6333333253860474, "rewards/correctness_reward_func/std": 0.4735424220561981, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 222 }, { "completion_length": 1618.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2832.0, "completions/max_terminated_length": 2832.0, "completions/mean_length": 1618.666748046875, "completions/mean_terminated_length": 1618.666748046875, "completions/min_length": 1044.0, "completions/min_terminated_length": 1044.0, "epoch": 0.07564450474898236, "frac_reward_zero_std": 0.0, "grad_norm": 0.5872350931167603, "kl": 0.0, "learning_rate": 4.7032436162870945e-07, "loss": 0.0123, "num_tokens": 7569689.0, "reward": 1.087499976158142, "reward_std": 0.3197711706161499, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.38138505816459656, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 223 }, { "completion_length": 1416.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2253.0, "completions/max_terminated_length": 2253.0, "completions/mean_length": 1416.8333740234375, "completions/mean_terminated_length": 1416.8333740234375, "completions/min_length": 550.0, "completions/min_terminated_length": 550.0, "epoch": 0.07598371777476255, "frac_reward_zero_std": 1.0, "grad_norm": 2.0870072603429435e-07, "kl": 0.0, "learning_rate": 4.70151828847481e-07, "loss": 0.0, "num_tokens": 7598295.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 224 }, { "completion_length": 1122.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2544.0, "completions/max_terminated_length": 2544.0, "completions/mean_length": 1122.75, "completions/mean_terminated_length": 1122.75, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 0.07632293080054274, "frac_reward_zero_std": 0.0, "grad_norm": 0.3819059133529663, "kl": 0.0, "learning_rate": 4.699792960662526e-07, "loss": -0.0098, "num_tokens": 7625124.0, "reward": 1.0833333730697632, "reward_std": 0.2010922133922577, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.07385490089654922, "step": 225 }, { "completion_length": 2394.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3629.0, "completions/max_terminated_length": 3629.0, "completions/mean_length": 2394.666748046875, "completions/mean_terminated_length": 2394.666748046875, "completions/min_length": 1138.0, "completions/min_terminated_length": 1138.0, "epoch": 0.07666214382632293, "frac_reward_zero_std": 0.0, "grad_norm": 0.9332161545753479, "kl": 0.0, "learning_rate": 4.698067632850241e-07, "loss": 0.0003, "num_tokens": 7666064.0, "reward": 0.9166666865348816, "reward_std": 0.3129710853099823, "rewards/correctness_reward_func/mean": 0.6166666150093079, "rewards/correctness_reward_func/std": 0.4628632962703705, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 226 }, { "completion_length": 864.1667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 1947.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 864.1666870117188, "completions/mean_terminated_length": 864.1666870117188, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 0.07700135685210312, "frac_reward_zero_std": 1.0, "grad_norm": 7.138903868053603e-08, "kl": 0.0, "learning_rate": 4.696342305037957e-07, "loss": 0.0, "num_tokens": 7688386.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 227 }, { "completion_length": 2737.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4239.0, "completions/max_terminated_length": 4239.0, "completions/mean_length": 2737.5, "completions/mean_terminated_length": 2737.5, "completions/min_length": 1432.0, "completions/min_terminated_length": 1432.0, "epoch": 0.0773405698778833, "frac_reward_zero_std": 0.0, "grad_norm": 0.6406996846199036, "kl": 0.0, "learning_rate": 4.6946169772256726e-07, "loss": 0.0023, "num_tokens": 7735372.0, "reward": 0.8000000715255737, "reward_std": 0.24494895339012146, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.4472135901451111, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 228 }, { "completion_length": 1794.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5741.0, "completions/max_terminated_length": 5741.0, "completions/mean_length": 1794.166748046875, "completions/mean_terminated_length": 1794.166748046875, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "epoch": 0.0776797829036635, "frac_reward_zero_std": 1.0, "grad_norm": 2.823469458235195e-07, "kl": 0.0, "learning_rate": 4.6928916494133887e-07, "loss": 0.0, "num_tokens": 7763172.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 229 }, { "completion_length": 2012.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3545.0, "completions/max_terminated_length": 3545.0, "completions/mean_length": 2012.166748046875, "completions/mean_terminated_length": 2012.166748046875, "completions/min_length": 992.0, "completions/min_terminated_length": 992.0, "epoch": 0.0780189959294437, "frac_reward_zero_std": 0.5, "grad_norm": 0.5563703775405884, "kl": 0.0, "learning_rate": 4.6911663216011037e-07, "loss": -0.0003, "num_tokens": 7796900.0, "reward": 0.8500000834465027, "reward_std": 0.2345207929611206, "rewards/correctness_reward_func/mean": 0.550000011920929, "rewards/correctness_reward_func/std": 0.4100997745990753, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 230 }, { "completion_length": 1420.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2537.0, "completions/max_terminated_length": 2537.0, "completions/mean_length": 1420.75, "completions/mean_terminated_length": 1420.75, "completions/min_length": 1000.0, "completions/min_terminated_length": 1000.0, "epoch": 0.07835820895522388, "frac_reward_zero_std": 0.5, "grad_norm": 0.374015748500824, "kl": 0.0, "learning_rate": 4.68944099378882e-07, "loss": -0.002, "num_tokens": 7822997.0, "reward": 0.6541666984558105, "reward_std": 0.21588000655174255, "rewards/correctness_reward_func/mean": 0.36666667461395264, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 231 }, { "completion_length": 1682.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4546.0, "completions/max_terminated_length": 4546.0, "completions/mean_length": 1682.166748046875, "completions/mean_terminated_length": 1682.166748046875, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.07869742198100407, "frac_reward_zero_std": 0.5, "grad_norm": 0.07696101814508438, "kl": 0.0, "learning_rate": 4.6877156659765353e-07, "loss": -0.0003, "num_tokens": 7851037.0, "reward": 0.7875000834465027, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 232 }, { "completion_length": 1198.5833740234375, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6101.0, "completions/mean_length": 3394.916748046875, "completions/mean_terminated_length": 1797.875, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 0.07903663500678426, "frac_reward_zero_std": 0.0, "grad_norm": 1.621031403541565, "kl": NaN, "learning_rate": 4.6859903381642513e-07, "loss": -0.0886, "num_tokens": 7876436.0, "reward": 0.7625000476837158, "reward_std": 0.3166946768760681, "rewards/correctness_reward_func/mean": 0.550000011920929, "rewards/correctness_reward_func/std": 0.4100997745990753, "rewards/format_reward_func/mean": 0.21250002086162567, "rewards/format_reward_func/std": 0.13505050539970398, "step": 233 }, { "completion_length": 2415.08349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 5319.0, "completions/max_terminated_length": 5319.0, "completions/mean_length": 2415.08349609375, "completions/mean_terminated_length": 2415.08349609375, "completions/min_length": 1029.0, "completions/min_terminated_length": 1029.0, "epoch": 0.07937584803256445, "frac_reward_zero_std": 0.5, "grad_norm": 0.14514191448688507, "kl": 0.0, "learning_rate": 4.6842650103519663e-07, "loss": -0.0108, "num_tokens": 7914051.0, "reward": 0.7333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 234 }, { "completion_length": 1313.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3703.0, "completions/max_terminated_length": 3703.0, "completions/mean_length": 1313.75, "completions/mean_terminated_length": 1313.75, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 0.07971506105834464, "frac_reward_zero_std": 0.0, "grad_norm": 1.3174349069595337, "kl": 0.0, "learning_rate": 4.6825396825396824e-07, "loss": 0.001, "num_tokens": 7941624.0, "reward": 0.7125000357627869, "reward_std": 0.09585144370794296, "rewards/correctness_reward_func/mean": 0.45000001788139343, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 235 }, { "completion_length": 800.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1525.0, "completions/max_terminated_length": 1525.0, "completions/mean_length": 800.3333740234375, "completions/mean_terminated_length": 800.3333740234375, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.08005427408412483, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.680814354727398e-07, "loss": 0.0, "num_tokens": 7963864.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 236 }, { "completion_length": 2569.166748046875, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4853.0, "completions/mean_length": 3667.33349609375, "completions/mean_terminated_length": 3083.0, "completions/min_length": 1522.0, "completions/min_terminated_length": 1522.0, "epoch": 0.08039348710990502, "frac_reward_zero_std": 0.0, "grad_norm": 1.3099312782287598, "kl": NaN, "learning_rate": 4.6790890269151135e-07, "loss": -0.0863, "num_tokens": 8008752.0, "reward": 0.9833334684371948, "reward_std": 0.35421618819236755, "rewards/correctness_reward_func/mean": 0.7333332896232605, "rewards/correctness_reward_func/std": 0.3550501763820648, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 237 }, { "completion_length": 883.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1392.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 883.4166870117188, "completions/mean_terminated_length": 883.4166870117188, "completions/min_length": 577.0, "completions/min_terminated_length": 577.0, "epoch": 0.0807327001356852, "frac_reward_zero_std": 0.0, "grad_norm": 0.08914493024349213, "kl": 0.0, "learning_rate": 4.677363699102829e-07, "loss": 0.001, "num_tokens": 8031389.0, "reward": 0.6916667819023132, "reward_std": 0.07955466210842133, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 238 }, { "completion_length": 1487.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3340.0, "completions/max_terminated_length": 3340.0, "completions/mean_length": 1487.25, "completions/mean_terminated_length": 1487.25, "completions/min_length": 669.0, "completions/min_terminated_length": 669.0, "epoch": 0.0810719131614654, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.675638371290545e-07, "loss": 0.0, "num_tokens": 8063738.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 239 }, { "completion_length": 3740.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5830.0, "completions/max_terminated_length": 5830.0, "completions/mean_length": 3740.416748046875, "completions/mean_terminated_length": 3740.416748046875, "completions/min_length": 1517.0, "completions/min_terminated_length": 1517.0, "epoch": 0.0814111261872456, "frac_reward_zero_std": 0.5, "grad_norm": 0.10733194649219513, "kl": 0.0, "learning_rate": 4.673913043478261e-07, "loss": -0.005, "num_tokens": 8124595.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 240 }, { "completion_length": 1372.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3355.0, "completions/max_terminated_length": 3355.0, "completions/mean_length": 1372.8333740234375, "completions/mean_terminated_length": 1372.8333740234375, "completions/min_length": 651.0, "completions/min_terminated_length": 651.0, "epoch": 0.08175033921302578, "frac_reward_zero_std": 1.0, "grad_norm": 2.8452433298298274e-07, "kl": 0.0, "learning_rate": 4.672187715665976e-07, "loss": 0.0, "num_tokens": 8154857.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 241 }, { "completion_length": 2199.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5927.0, "completions/max_terminated_length": 5927.0, "completions/mean_length": 2199.25, "completions/mean_terminated_length": 2199.25, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.08208955223880597, "frac_reward_zero_std": 0.5, "grad_norm": 0.19338767230510712, "kl": 0.0, "learning_rate": 4.670462387853692e-07, "loss": -0.0023, "num_tokens": 8192258.0, "reward": 0.7333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 242 }, { "completion_length": 1042.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2626.0, "completions/max_terminated_length": 2626.0, "completions/mean_length": 1042.75, "completions/mean_terminated_length": 1042.75, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 0.08242876526458616, "frac_reward_zero_std": 0.5, "grad_norm": 0.4441406726837158, "kl": 0.0, "learning_rate": 4.6687370600414077e-07, "loss": -0.0016, "num_tokens": 8216441.0, "reward": 1.1000001430511475, "reward_std": 0.20000001788139343, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 243 }, { "completion_length": 1917.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5127.0, "completions/max_terminated_length": 5127.0, "completions/mean_length": 1917.916748046875, "completions/mean_terminated_length": 1917.916748046875, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 0.08276797829036635, "frac_reward_zero_std": 1.0, "grad_norm": 1.564266796094671e-07, "kl": 0.0, "learning_rate": 4.667011732229124e-07, "loss": 0.0, "num_tokens": 8256160.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 244 }, { "completion_length": 1719.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2907.0, "completions/max_terminated_length": 2907.0, "completions/mean_length": 1719.0833740234375, "completions/mean_terminated_length": 1719.0833740234375, "completions/min_length": 848.0, "completions/min_terminated_length": 848.0, "epoch": 0.08310719131614654, "frac_reward_zero_std": 0.0, "grad_norm": 0.6577960848808289, "kl": 0.0, "learning_rate": 4.665286404416839e-07, "loss": -0.0004, "num_tokens": 8285387.0, "reward": 1.0374999046325684, "reward_std": 0.3044798970222473, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 245 }, { "completion_length": 2581.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4833.0, "completions/max_terminated_length": 4833.0, "completions/mean_length": 2581.75, "completions/mean_terminated_length": 2581.75, "completions/min_length": 904.0, "completions/min_terminated_length": 904.0, "epoch": 0.08344640434192672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.663561076604555e-07, "loss": 0.0, "num_tokens": 8325866.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 246 }, { "completion_length": 1798.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3059.0, "completions/max_terminated_length": 3059.0, "completions/mean_length": 1798.5, "completions/mean_terminated_length": 1798.5, "completions/min_length": 750.0, "completions/min_terminated_length": 750.0, "epoch": 0.08378561736770691, "frac_reward_zero_std": 1.0, "grad_norm": 3.0002641437931743e-07, "kl": 0.0, "learning_rate": 4.6618357487922703e-07, "loss": 0.0, "num_tokens": 8360000.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 247 }, { "completion_length": 1804.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3874.0, "completions/max_terminated_length": 3874.0, "completions/mean_length": 1804.666748046875, "completions/mean_terminated_length": 1804.666748046875, "completions/min_length": 550.0, "completions/min_terminated_length": 550.0, "epoch": 0.08412483039348712, "frac_reward_zero_std": 0.5, "grad_norm": 0.7278051376342773, "kl": 0.0, "learning_rate": 4.660110420979986e-07, "loss": -0.0089, "num_tokens": 8396068.0, "reward": 0.6333333253860474, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 248 }, { "completion_length": 1632.5000610351562, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6421.0, "completions/mean_length": 2730.666748046875, "completions/mean_terminated_length": 1959.0, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.0844640434192673, "frac_reward_zero_std": 0.5, "grad_norm": 2.0582363605499268, "kl": NaN, "learning_rate": 4.6583850931677014e-07, "loss": -0.0822, "num_tokens": 8426572.0, "reward": 0.9833334684371948, "reward_std": 0.3356585204601288, "rewards/correctness_reward_func/mean": 0.7333332896232605, "rewards/correctness_reward_func/std": 0.35505014657974243, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 249 }, { "completion_length": 936.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2077.0, "completions/max_terminated_length": 2077.0, "completions/mean_length": 936.4166870117188, "completions/mean_terminated_length": 936.4166870117188, "completions/min_length": 527.0, "completions/min_terminated_length": 527.0, "epoch": 0.08480325644504749, "frac_reward_zero_std": 0.0, "grad_norm": 0.09999001771211624, "kl": 0.0, "learning_rate": 4.6566597653554174e-07, "loss": -0.0015, "num_tokens": 8449605.0, "reward": 1.0916666984558105, "reward_std": 0.07955464720726013, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 250 }, { "completion_length": 1101.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2416.0, "completions/max_terminated_length": 2416.0, "completions/mean_length": 1101.3333740234375, "completions/mean_terminated_length": 1101.3333740234375, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 0.08514246947082768, "frac_reward_zero_std": 0.5, "grad_norm": 0.057271238416433334, "kl": 0.0, "learning_rate": 4.654934437543133e-07, "loss": 0.0001, "num_tokens": 8478667.0, "reward": 1.0750000476837158, "reward_std": 0.038729824125766754, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 251 }, { "completion_length": 2234.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3281.0, "completions/max_terminated_length": 3281.0, "completions/mean_length": 2234.416748046875, "completions/mean_terminated_length": 2234.416748046875, "completions/min_length": 849.0, "completions/min_terminated_length": 849.0, "epoch": 0.08548168249660787, "frac_reward_zero_std": 1.0, "grad_norm": 3.4393929126963485e-07, "kl": 0.0, "learning_rate": 4.6532091097308485e-07, "loss": 0.0, "num_tokens": 8519382.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 252 }, { "completion_length": 2895.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5231.0, "completions/max_terminated_length": 5231.0, "completions/mean_length": 2895.83349609375, "completions/mean_terminated_length": 2895.83349609375, "completions/min_length": 1011.0, "completions/min_terminated_length": 1011.0, "epoch": 0.08582089552238806, "frac_reward_zero_std": 0.0, "grad_norm": 0.6422632336616516, "kl": 0.0, "learning_rate": 4.651483781918564e-07, "loss": -0.0129, "num_tokens": 8563696.0, "reward": 0.9500000476837158, "reward_std": 0.2917786240577698, "rewards/correctness_reward_func/mean": 0.6500000357627869, "rewards/correctness_reward_func/std": 0.40113475918769836, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 253 }, { "completion_length": 2033.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3983.0, "completions/max_terminated_length": 3983.0, "completions/mean_length": 2033.416748046875, "completions/mean_terminated_length": 2033.416748046875, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 0.08616010854816825, "frac_reward_zero_std": 0.5, "grad_norm": 0.05713088810443878, "kl": 0.0, "learning_rate": 4.64975845410628e-07, "loss": -0.0011, "num_tokens": 8598651.0, "reward": 0.6875001788139343, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 254 }, { "completion_length": 1276.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2436.0, "completions/max_terminated_length": 2436.0, "completions/mean_length": 1276.0, "completions/mean_terminated_length": 1276.0, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "epoch": 0.08649932157394843, "frac_reward_zero_std": 1.0, "grad_norm": 3.0110948046058184e-07, "kl": 0.0, "learning_rate": 4.648033126293996e-07, "loss": 0.0, "num_tokens": 8624469.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 255 }, { "completion_length": 2234.0833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4899.0, "completions/mean_length": 2783.166748046875, "completions/mean_terminated_length": 2437.181884765625, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.08683853459972862, "frac_reward_zero_std": 0.0, "grad_norm": 0.11855963617563248, "kl": NaN, "learning_rate": 4.646307798481711e-07, "loss": -0.0076, "num_tokens": 8665972.0, "reward": 0.7749999761581421, "reward_std": 0.06123725324869156, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 256 }, { "completion_length": 2740.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5840.0, "completions/max_terminated_length": 5840.0, "completions/mean_length": 2740.25, "completions/mean_terminated_length": 2740.25, "completions/min_length": 1029.0, "completions/min_terminated_length": 1029.0, "epoch": 0.08717774762550883, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.644582470669427e-07, "loss": 0.0, "num_tokens": 8714371.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 257 }, { "completion_length": 3532.8333740234375, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6254.0, "completions/mean_length": 5180.08349609375, "completions/mean_terminated_length": 4710.4443359375, "completions/min_length": 3218.0, "completions/min_terminated_length": 3218.0, "epoch": 0.08751696065128901, "frac_reward_zero_std": 0.0, "grad_norm": 0.22339831292629242, "kl": NaN, "learning_rate": 4.6428571428571427e-07, "loss": -0.016, "num_tokens": 8766629.0, "reward": 0.6750000715255737, "reward_std": 0.1218542754650116, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.11965861916542053, "step": 258 }, { "completion_length": 1992.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3612.0, "completions/max_terminated_length": 3612.0, "completions/mean_length": 1992.666748046875, "completions/mean_terminated_length": 1992.666748046875, "completions/min_length": 869.0, "completions/min_terminated_length": 869.0, "epoch": 0.0878561736770692, "frac_reward_zero_std": 0.0, "grad_norm": 0.47433897852897644, "kl": 0.0, "learning_rate": 4.641131815044858e-07, "loss": 0.0026, "num_tokens": 8805757.0, "reward": 0.5541666746139526, "reward_std": 0.2371777594089508, "rewards/correctness_reward_func/mean": 0.2666666805744171, "rewards/correctness_reward_func/std": 0.393892765045166, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 259 }, { "completion_length": 2260.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5259.0, "completions/max_terminated_length": 5259.0, "completions/mean_length": 2260.416748046875, "completions/mean_terminated_length": 2260.416748046875, "completions/min_length": 1118.0, "completions/min_terminated_length": 1118.0, "epoch": 0.08819538670284939, "frac_reward_zero_std": 0.5, "grad_norm": 0.11118398606777191, "kl": 0.0, "learning_rate": 4.639406487232574e-07, "loss": 0.0031, "num_tokens": 8846226.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 260 }, { "completion_length": 2100.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 3065.0, "completions/max_terminated_length": 3065.0, "completions/mean_length": 2100.5, "completions/mean_terminated_length": 2100.5, "completions/min_length": 1150.0, "completions/min_terminated_length": 1150.0, "epoch": 0.08853459972862958, "frac_reward_zero_std": 0.5, "grad_norm": 0.14715582132339478, "kl": 0.0, "learning_rate": 4.63768115942029e-07, "loss": 0.0013, "num_tokens": 8881794.0, "reward": 1.1041667461395264, "reward_std": 0.05571504682302475, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 261 }, { "completion_length": 1970.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3088.0, "completions/max_terminated_length": 3088.0, "completions/mean_length": 1970.916748046875, "completions/mean_terminated_length": 1970.916748046875, "completions/min_length": 860.0, "completions/min_terminated_length": 860.0, "epoch": 0.08887381275440977, "frac_reward_zero_std": 1.0, "grad_norm": 2.0765658348409488e-07, "kl": 0.0, "learning_rate": 4.6359558316080054e-07, "loss": 0.0, "num_tokens": 8914019.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 262 }, { "completion_length": 1055.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2481.0, "completions/max_terminated_length": 2481.0, "completions/mean_length": 1055.75, "completions/mean_terminated_length": 1055.75, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.08921302578018996, "frac_reward_zero_std": 0.5, "grad_norm": 0.059661220759153366, "kl": 0.0, "learning_rate": 4.634230503795721e-07, "loss": -0.0, "num_tokens": 8938184.0, "reward": 0.6875001788139343, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 263 }, { "completion_length": 2159.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3540.0, "completions/max_terminated_length": 3540.0, "completions/mean_length": 2159.666748046875, "completions/mean_terminated_length": 2159.666748046875, "completions/min_length": 879.0, "completions/min_terminated_length": 879.0, "epoch": 0.08955223880597014, "frac_reward_zero_std": 1.0, "grad_norm": 1.261990263401458e-07, "kl": 0.0, "learning_rate": 4.6325051759834364e-07, "loss": 0.0, "num_tokens": 8976838.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 264 }, { "completion_length": 1059.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2702.0, "completions/max_terminated_length": 2702.0, "completions/mean_length": 1059.0, "completions/mean_terminated_length": 1059.0, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.08989145183175033, "frac_reward_zero_std": 1.0, "grad_norm": 2.0293516911351617e-07, "kl": 0.0, "learning_rate": 4.6307798481711525e-07, "loss": 0.0, "num_tokens": 9005260.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 265 }, { "completion_length": 1051.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1806.0, "completions/max_terminated_length": 1806.0, "completions/mean_length": 1051.0, "completions/mean_terminated_length": 1051.0, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 0.09023066485753053, "frac_reward_zero_std": 0.5, "grad_norm": 0.35058966279029846, "kl": 0.0, "learning_rate": 4.629054520358868e-07, "loss": -0.0082, "num_tokens": 9026614.0, "reward": 0.7833334803581238, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4833333194255829, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 266 }, { "completion_length": 1659.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3826.0, "completions/max_terminated_length": 3826.0, "completions/mean_length": 1659.25, "completions/mean_terminated_length": 1659.25, "completions/min_length": 695.0, "completions/min_terminated_length": 695.0, "epoch": 0.09056987788331072, "frac_reward_zero_std": 0.5, "grad_norm": 0.540415346622467, "kl": 0.0, "learning_rate": 4.6273291925465835e-07, "loss": 0.0041, "num_tokens": 9057775.0, "reward": 1.0833334922790527, "reward_std": 0.19407902657985687, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 267 }, { "completion_length": 2549.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3837.0, "completions/max_terminated_length": 3837.0, "completions/mean_length": 2549.916748046875, "completions/mean_terminated_length": 2549.916748046875, "completions/min_length": 1170.0, "completions/min_terminated_length": 1170.0, "epoch": 0.09090909090909091, "frac_reward_zero_std": 0.5, "grad_norm": 0.09202612936496735, "kl": 0.0, "learning_rate": 4.625603864734299e-07, "loss": -0.002, "num_tokens": 9100188.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 268 }, { "completion_length": 2686.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6176.0, "completions/max_terminated_length": 6176.0, "completions/mean_length": 2686.0, "completions/mean_terminated_length": 2686.0, "completions/min_length": 1286.0, "completions/min_terminated_length": 1286.0, "epoch": 0.0912483039348711, "frac_reward_zero_std": 0.5, "grad_norm": 0.11565620452165604, "kl": 0.0, "learning_rate": 4.623878536922015e-07, "loss": 0.0012, "num_tokens": 9148038.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 269 }, { "completion_length": 1715.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4031.0, "completions/max_terminated_length": 4031.0, "completions/mean_length": 1715.5833740234375, "completions/mean_terminated_length": 1715.5833740234375, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 0.09158751696065129, "frac_reward_zero_std": 0.5, "grad_norm": 0.7160343527793884, "kl": 0.0, "learning_rate": 4.6221532091097307e-07, "loss": -0.0202, "num_tokens": 9179917.0, "reward": 0.4333333671092987, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.13333334028720856, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 270 }, { "completion_length": 2181.0, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6022.0, "completions/mean_length": 4926.4169921875, "completions/mean_terminated_length": 3738.857421875, "completions/min_length": 1328.0, "completions/min_terminated_length": 1328.0, "epoch": 0.09192672998643148, "frac_reward_zero_std": 0.5, "grad_norm": 0.20487481355667114, "kl": NaN, "learning_rate": 4.620427881297446e-07, "loss": -0.019, "num_tokens": 9219079.0, "reward": 0.17500001192092896, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.17500001192092896, "rewards/format_reward_func/std": 0.15447859466075897, "step": 271 }, { "completion_length": 2217.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3837.0, "completions/max_terminated_length": 3837.0, "completions/mean_length": 2217.916748046875, "completions/mean_terminated_length": 2217.916748046875, "completions/min_length": 841.0, "completions/min_terminated_length": 841.0, "epoch": 0.09226594301221167, "frac_reward_zero_std": 0.5, "grad_norm": 0.6833940744400024, "kl": 0.0, "learning_rate": 4.618702553485162e-07, "loss": 0.0011, "num_tokens": 9260148.0, "reward": 0.6000000238418579, "reward_std": 0.23664319515228271, "rewards/correctness_reward_func/mean": 0.29999998211860657, "rewards/correctness_reward_func/std": 0.4472135901451111, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 272 }, { "completion_length": 3103.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4943.0, "completions/max_terminated_length": 4943.0, "completions/mean_length": 3103.5, "completions/mean_terminated_length": 3103.5, "completions/min_length": 2078.0, "completions/min_terminated_length": 2078.0, "epoch": 0.09260515603799185, "frac_reward_zero_std": 0.0, "grad_norm": 0.4680456817150116, "kl": 0.0, "learning_rate": 4.616977225672878e-07, "loss": -0.0159, "num_tokens": 9307752.0, "reward": 1.070833444595337, "reward_std": 0.2576434314250946, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 273 }, { "completion_length": 1276.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3375.0, "completions/max_terminated_length": 3375.0, "completions/mean_length": 1276.75, "completions/mean_terminated_length": 1276.75, "completions/min_length": 541.0, "completions/min_terminated_length": 541.0, "epoch": 0.09294436906377204, "frac_reward_zero_std": 1.0, "grad_norm": 2.443483992919937e-07, "kl": 0.0, "learning_rate": 4.6152518978605933e-07, "loss": 0.0, "num_tokens": 9331791.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 274 }, { "completion_length": 2395.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4769.0, "completions/max_terminated_length": 4769.0, "completions/mean_length": 2395.916748046875, "completions/mean_terminated_length": 2395.916748046875, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "epoch": 0.09328358208955224, "frac_reward_zero_std": 0.0, "grad_norm": 0.8109861612319946, "kl": 0.0, "learning_rate": 4.613526570048309e-07, "loss": 0.0024, "num_tokens": 9368888.0, "reward": 0.6375000476837158, "reward_std": 0.4031320810317993, "rewards/correctness_reward_func/mean": 0.3499999940395355, "rewards/correctness_reward_func/std": 0.4358898997306824, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 275 }, { "completion_length": 2529.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5854.0, "completions/max_terminated_length": 5854.0, "completions/mean_length": 2529.916748046875, "completions/mean_terminated_length": 2529.916748046875, "completions/min_length": 1131.0, "completions/min_terminated_length": 1131.0, "epoch": 0.09362279511533243, "frac_reward_zero_std": 0.0, "grad_norm": 0.44502562284469604, "kl": 0.0, "learning_rate": 4.611801242236025e-07, "loss": 0.0014, "num_tokens": 9407575.0, "reward": 1.0499999523162842, "reward_std": 0.24738392233848572, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.36306774616241455, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 276 }, { "completion_length": 2848.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4154.0, "completions/max_terminated_length": 4154.0, "completions/mean_length": 2848.33349609375, "completions/mean_terminated_length": 2848.33349609375, "completions/min_length": 1579.0, "completions/min_terminated_length": 1579.0, "epoch": 0.09396200814111262, "frac_reward_zero_std": 0.0, "grad_norm": 0.13393691182136536, "kl": 0.0, "learning_rate": 4.6100759144237404e-07, "loss": -0.0004, "num_tokens": 9455825.0, "reward": 0.2625000476837158, "reward_std": 0.06934845447540283, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 277 }, { "completion_length": 3776.666748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6433.0, "completions/mean_length": 5423.9169921875, "completions/mean_terminated_length": 5035.5556640625, "completions/min_length": 3525.0, "completions/min_terminated_length": 3525.0, "epoch": 0.09430122116689281, "frac_reward_zero_std": 0.0, "grad_norm": 0.8137674927711487, "kl": NaN, "learning_rate": 4.608350586611456e-07, "loss": -0.0881, "num_tokens": 9508333.0, "reward": 0.720833420753479, "reward_std": 0.4936787486076355, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.11894422769546509, "step": 278 }, { "completion_length": 2009.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5969.0, "completions/max_terminated_length": 5969.0, "completions/mean_length": 2009.3333740234375, "completions/mean_terminated_length": 2009.3333740234375, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.094640434192673, "frac_reward_zero_std": 0.5, "grad_norm": 0.8695447444915771, "kl": 0.0, "learning_rate": 4.6066252587991715e-07, "loss": 0.0393, "num_tokens": 9539867.0, "reward": 0.5, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.20000000298023224, "rewards/correctness_reward_func/std": 0.36181360483169556, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 279 }, { "completion_length": 1017.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1355.0, "completions/max_terminated_length": 1355.0, "completions/mean_length": 1017.5, "completions/mean_terminated_length": 1017.5, "completions/min_length": 628.0, "completions/min_terminated_length": 628.0, "epoch": 0.09497964721845319, "frac_reward_zero_std": 0.5, "grad_norm": 0.05056990310549736, "kl": 0.0, "learning_rate": 4.6048999309868875e-07, "loss": -0.001, "num_tokens": 9560039.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 280 }, { "completion_length": 2585.0001220703125, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6297.0, "completions/mean_length": 3683.166748046875, "completions/mean_terminated_length": 3102.0, "completions/min_length": 592.0, "completions/min_terminated_length": 592.0, "epoch": 0.09531886024423337, "frac_reward_zero_std": 0.5, "grad_norm": 0.1040610745549202, "kl": NaN, "learning_rate": 4.6031746031746025e-07, "loss": -0.0198, "num_tokens": 9600029.0, "reward": 0.6625000834465027, "reward_std": 0.06274950504302979, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 281 }, { "completion_length": 2685.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5926.0, "completions/max_terminated_length": 5926.0, "completions/mean_length": 2685.83349609375, "completions/mean_terminated_length": 2685.83349609375, "completions/min_length": 1012.0, "completions/min_terminated_length": 1012.0, "epoch": 0.09565807327001356, "frac_reward_zero_std": 0.0, "grad_norm": 0.6141899824142456, "kl": 0.0, "learning_rate": 4.6014492753623186e-07, "loss": -0.0061, "num_tokens": 9644097.0, "reward": 0.8875000476837158, "reward_std": 0.2698235511779785, "rewards/correctness_reward_func/mean": 0.5999999642372131, "rewards/correctness_reward_func/std": 0.45126086473464966, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 282 }, { "completion_length": 2492.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 6501.0, "completions/max_terminated_length": 6501.0, "completions/mean_length": 2492.58349609375, "completions/mean_terminated_length": 2492.58349609375, "completions/min_length": 702.0, "completions/min_terminated_length": 702.0, "epoch": 0.09599728629579375, "frac_reward_zero_std": 0.5, "grad_norm": 0.10037942975759506, "kl": 0.0, "learning_rate": 4.599723947550034e-07, "loss": -0.0009, "num_tokens": 9686560.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 283 }, { "completion_length": 1550.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3066.0, "completions/max_terminated_length": 3066.0, "completions/mean_length": 1550.916748046875, "completions/mean_terminated_length": 1550.916748046875, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 0.09633649932157395, "frac_reward_zero_std": 0.5, "grad_norm": 0.05274030938744545, "kl": 0.0, "learning_rate": 4.59799861973775e-07, "loss": 0.0002, "num_tokens": 9716757.0, "reward": 1.1875, "reward_std": 0.030618613585829735, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 284 }, { "completion_length": 2718.08349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 4146.0, "completions/max_terminated_length": 4146.0, "completions/mean_length": 2718.08349609375, "completions/mean_terminated_length": 2718.08349609375, "completions/min_length": 1100.0, "completions/min_terminated_length": 1100.0, "epoch": 0.09667571234735414, "frac_reward_zero_std": 0.5, "grad_norm": 0.087214395403862, "kl": 0.0, "learning_rate": 4.596273291925465e-07, "loss": -0.002, "num_tokens": 9761284.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 285 }, { "completion_length": 1447.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2123.0, "completions/max_terminated_length": 2123.0, "completions/mean_length": 1447.25, "completions/mean_terminated_length": 1447.25, "completions/min_length": 954.0, "completions/min_terminated_length": 954.0, "epoch": 0.09701492537313433, "frac_reward_zero_std": 1.0, "grad_norm": 3.683128113607381e-07, "kl": 0.0, "learning_rate": 4.594547964113181e-07, "loss": 0.0, "num_tokens": 9792013.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 286 }, { "completion_length": 1051.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2202.0, "completions/max_terminated_length": 2202.0, "completions/mean_length": 1051.0833740234375, "completions/mean_terminated_length": 1051.0833740234375, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 0.09735413839891452, "frac_reward_zero_std": 1.0, "grad_norm": 1.0335989486520702e-07, "kl": 0.0, "learning_rate": 4.5928226363008973e-07, "loss": 0.0, "num_tokens": 9817568.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 287 }, { "completion_length": 2563.8333740234375, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5924.0, "completions/mean_length": 4211.08349609375, "completions/mean_terminated_length": 3418.444580078125, "completions/min_length": 1609.0, "completions/min_terminated_length": 1609.0, "epoch": 0.09769335142469471, "frac_reward_zero_std": 0.5, "grad_norm": 0.8279024958610535, "kl": NaN, "learning_rate": 4.591097308488613e-07, "loss": -0.0875, "num_tokens": 9862596.0, "reward": 0.8041667342185974, "reward_std": 0.3116154074668884, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.42497774958610535, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.11894422769546509, "step": 288 }, { "completion_length": 3452.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5503.0, "completions/max_terminated_length": 5503.0, "completions/mean_length": 3452.83349609375, "completions/mean_terminated_length": 3452.83349609375, "completions/min_length": 2014.0, "completions/min_terminated_length": 2014.0, "epoch": 0.0980325644504749, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.5893719806763283e-07, "loss": 0.0, "num_tokens": 9915352.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 289 }, { "completion_length": 1157.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1837.0, "completions/max_terminated_length": 1837.0, "completions/mean_length": 1157.416748046875, "completions/mean_terminated_length": 1157.416748046875, "completions/min_length": 640.0, "completions/min_terminated_length": 640.0, "epoch": 0.09837177747625508, "frac_reward_zero_std": 0.5, "grad_norm": 0.06177419424057007, "kl": 0.0, "learning_rate": 4.587646652864044e-07, "loss": 0.0001, "num_tokens": 9939849.0, "reward": 1.0875000953674316, "reward_std": 0.030618613585829735, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 290 }, { "completion_length": 595.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1038.0, "completions/max_terminated_length": 1038.0, "completions/mean_length": 595.25, "completions/mean_terminated_length": 595.25, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.09871099050203527, "frac_reward_zero_std": 0.0, "grad_norm": 0.08985080569982529, "kl": 0.0, "learning_rate": 4.58592132505176e-07, "loss": 0.0007, "num_tokens": 9958920.0, "reward": 0.7583333849906921, "reward_std": 0.07955463975667953, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 291 }, { "completion_length": 2860.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5107.0, "completions/max_terminated_length": 5107.0, "completions/mean_length": 2860.666748046875, "completions/mean_terminated_length": 2860.666748046875, "completions/min_length": 847.0, "completions/min_terminated_length": 847.0, "epoch": 0.09905020352781546, "frac_reward_zero_std": 0.0, "grad_norm": 0.6327357292175293, "kl": 0.0, "learning_rate": 4.584195997239475e-07, "loss": -0.0131, "num_tokens": 9998972.0, "reward": 0.7666667699813843, "reward_std": 0.36985844373703003, "rewards/correctness_reward_func/mean": 0.46666669845581055, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 292 }, { "completion_length": 2198.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4135.0, "completions/max_terminated_length": 4135.0, "completions/mean_length": 2198.83349609375, "completions/mean_terminated_length": 2198.83349609375, "completions/min_length": 1079.0, "completions/min_terminated_length": 1079.0, "epoch": 0.09938941655359566, "frac_reward_zero_std": 0.0, "grad_norm": 0.14555297791957855, "kl": 0.0, "learning_rate": 4.582470669427191e-07, "loss": -0.004, "num_tokens": 10040952.0, "reward": 1.1666667461395264, "reward_std": 0.09559705853462219, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 293 }, { "completion_length": 1870.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5196.0, "completions/max_terminated_length": 5196.0, "completions/mean_length": 1870.0833740234375, "completions/mean_terminated_length": 1870.0833740234375, "completions/min_length": 573.0, "completions/min_terminated_length": 573.0, "epoch": 0.09972862957937585, "frac_reward_zero_std": 0.5, "grad_norm": 0.6394572257995605, "kl": 0.0, "learning_rate": 4.5807453416149065e-07, "loss": 0.0048, "num_tokens": 10071475.0, "reward": 0.7833334803581238, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4833333194255829, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 294 }, { "completion_length": 2610.9166870117188, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5046.0, "completions/mean_length": 3709.08349609375, "completions/mean_terminated_length": 3133.10009765625, "completions/min_length": 1825.0, "completions/min_terminated_length": 1825.0, "epoch": 0.10006784260515604, "frac_reward_zero_std": 0.0, "grad_norm": 0.9850728511810303, "kl": NaN, "learning_rate": 4.5790200138026226e-07, "loss": -0.0765, "num_tokens": 10118868.0, "reward": 0.8333333730697632, "reward_std": 0.5316232442855835, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 295 }, { "completion_length": 3167.666748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6466.0, "completions/mean_length": 4814.9169921875, "completions/mean_terminated_length": 4223.5556640625, "completions/min_length": 1487.0, "completions/min_terminated_length": 1487.0, "epoch": 0.10040705563093623, "frac_reward_zero_std": 0.5, "grad_norm": 0.3844717741012573, "kl": NaN, "learning_rate": 4.5772946859903376e-07, "loss": -0.0294, "num_tokens": 10169294.0, "reward": 0.6375000476837158, "reward_std": 0.07373940199613571, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.11894422769546509, "step": 296 }, { "completion_length": 2613.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4815.0, "completions/max_terminated_length": 4815.0, "completions/mean_length": 2613.0, "completions/mean_terminated_length": 2613.0, "completions/min_length": 1343.0, "completions/min_terminated_length": 1343.0, "epoch": 0.10074626865671642, "frac_reward_zero_std": 0.5, "grad_norm": 0.12785717844963074, "kl": 0.0, "learning_rate": 4.5755693581780536e-07, "loss": 0.0003, "num_tokens": 10213808.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 297 }, { "completion_length": 1011.9167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 1775.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 1011.9166870117188, "completions/mean_terminated_length": 1011.9166870117188, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 0.1010854816824966, "frac_reward_zero_std": 0.5, "grad_norm": 0.48920974135398865, "kl": 0.0, "learning_rate": 4.573844030365769e-07, "loss": -0.0006, "num_tokens": 10236667.0, "reward": 1.2041666507720947, "reward_std": 0.2002602517604828, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 298 }, { "completion_length": 3067.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6274.0, "completions/max_terminated_length": 6274.0, "completions/mean_length": 3067.5, "completions/mean_terminated_length": 3067.5, "completions/min_length": 1115.0, "completions/min_terminated_length": 1115.0, "epoch": 0.1014246947082768, "frac_reward_zero_std": 0.5, "grad_norm": 0.07078174501657486, "kl": 0.0, "learning_rate": 4.572118702553485e-07, "loss": -0.0003, "num_tokens": 10285699.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 299 }, { "completion_length": 2652.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4168.0, "completions/max_terminated_length": 4168.0, "completions/mean_length": 2652.25, "completions/mean_terminated_length": 2652.25, "completions/min_length": 1339.0, "completions/min_terminated_length": 1339.0, "epoch": 0.10176390773405698, "frac_reward_zero_std": 0.5, "grad_norm": 0.06933386623859406, "kl": 0.0, "learning_rate": 4.5703933747412e-07, "loss": -0.0005, "num_tokens": 10330456.0, "reward": 0.6750000715255737, "reward_std": 0.03872981667518616, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 300 }, { "completion_length": 489.00001525878906, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 1186.0, "completions/mean_length": 3783.5, "completions/mean_terminated_length": 978.0, "completions/min_length": 566.0, "completions/min_terminated_length": 566.0, "epoch": 0.10210312075983717, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": NaN, "learning_rate": 4.568668046928916e-07, "loss": 0.0, "num_tokens": 10348420.0, "reward": 0.15000000596046448, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 301 }, { "completion_length": 3680.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5165.0, "completions/max_terminated_length": 5165.0, "completions/mean_length": 3680.33349609375, "completions/mean_terminated_length": 3680.33349609375, "completions/min_length": 1925.0, "completions/min_terminated_length": 1925.0, "epoch": 0.10244233378561737, "frac_reward_zero_std": 1.0, "grad_norm": 4.457709223970596e-07, "kl": 0.0, "learning_rate": 4.5669427191166323e-07, "loss": 0.0, "num_tokens": 10402940.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 302 }, { "completion_length": 1168.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2076.0, "completions/max_terminated_length": 2076.0, "completions/mean_length": 1168.0, "completions/mean_terminated_length": 1168.0, "completions/min_length": 620.0, "completions/min_terminated_length": 620.0, "epoch": 0.10278154681139756, "frac_reward_zero_std": 0.0, "grad_norm": 0.5187942981719971, "kl": 0.0, "learning_rate": 4.5652173913043473e-07, "loss": -0.0028, "num_tokens": 10434620.0, "reward": 1.0375001430511475, "reward_std": 0.2883797585964203, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.36306774616241455, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 303 }, { "completion_length": 1859.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3710.0, "completions/max_terminated_length": 3710.0, "completions/mean_length": 1859.916748046875, "completions/mean_terminated_length": 1859.916748046875, "completions/min_length": 541.0, "completions/min_terminated_length": 541.0, "epoch": 0.10312075983717775, "frac_reward_zero_std": 0.5, "grad_norm": 0.17270970344543457, "kl": 0.0, "learning_rate": 4.5634920634920634e-07, "loss": -0.0001, "num_tokens": 10468945.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 304 }, { "completion_length": 2067.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3797.0, "completions/max_terminated_length": 3797.0, "completions/mean_length": 2067.166748046875, "completions/mean_terminated_length": 2067.166748046875, "completions/min_length": 720.0, "completions/min_terminated_length": 720.0, "epoch": 0.10345997286295794, "frac_reward_zero_std": 1.0, "grad_norm": 2.4897084927033575e-07, "kl": 0.0, "learning_rate": 4.561766735679779e-07, "loss": 0.0, "num_tokens": 10505637.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 305 }, { "completion_length": 769.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1071.0, "completions/max_terminated_length": 1071.0, "completions/mean_length": 769.25, "completions/mean_terminated_length": 769.25, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "epoch": 0.10379918588873813, "frac_reward_zero_std": 1.0, "grad_norm": 9.078951990204587e-08, "kl": 0.0, "learning_rate": 4.560041407867495e-07, "loss": 0.0, "num_tokens": 10528476.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 306 }, { "completion_length": 2807.5833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4502.0, "completions/mean_length": 3356.666748046875, "completions/mean_terminated_length": 3062.818359375, "completions/min_length": 1536.0, "completions/min_terminated_length": 1536.0, "epoch": 0.10413839891451832, "frac_reward_zero_std": 0.5, "grad_norm": 0.09662654250860214, "kl": NaN, "learning_rate": 4.55831608005521e-07, "loss": -0.0059, "num_tokens": 10573567.0, "reward": 0.6875001788139343, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 307 }, { "completion_length": 971.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1409.0, "completions/max_terminated_length": 1409.0, "completions/mean_length": 971.25, "completions/mean_terminated_length": 971.25, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "epoch": 0.1044776119402985, "frac_reward_zero_std": 0.5, "grad_norm": 0.07163669914007187, "kl": 0.0, "learning_rate": 4.556590752242926e-07, "loss": 0.0004, "num_tokens": 10592332.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 308 }, { "completion_length": 3025.5833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5822.0, "completions/mean_length": 3574.666748046875, "completions/mean_terminated_length": 3300.636474609375, "completions/min_length": 919.0, "completions/min_terminated_length": 919.0, "epoch": 0.10481682496607869, "frac_reward_zero_std": 0.5, "grad_norm": 0.5091751217842102, "kl": NaN, "learning_rate": 4.5548654244306415e-07, "loss": -0.0043, "num_tokens": 10640033.0, "reward": 0.770833432674408, "reward_std": 0.2123773992061615, "rewards/correctness_reward_func/mean": 0.4833333194255829, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 309 }, { "completion_length": 1609.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2961.0, "completions/max_terminated_length": 2961.0, "completions/mean_length": 1609.916748046875, "completions/mean_terminated_length": 1609.916748046875, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 0.10515603799185888, "frac_reward_zero_std": 0.5, "grad_norm": 0.07517128437757492, "kl": 0.0, "learning_rate": 4.5531400966183576e-07, "loss": -0.0007, "num_tokens": 10670926.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 310 }, { "completion_length": 3589.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4867.0, "completions/max_terminated_length": 4867.0, "completions/mean_length": 3589.75, "completions/mean_terminated_length": 3589.75, "completions/min_length": 2861.0, "completions/min_terminated_length": 2861.0, "epoch": 0.10549525101763908, "frac_reward_zero_std": 0.0, "grad_norm": 0.1443360447883606, "kl": 0.0, "learning_rate": 4.5514147688060726e-07, "loss": -0.0008, "num_tokens": 10727071.0, "reward": 0.7041667699813843, "reward_std": 0.07144345343112946, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 311 }, { "completion_length": 1537.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4027.0, "completions/max_terminated_length": 4027.0, "completions/mean_length": 1537.3333740234375, "completions/mean_terminated_length": 1537.3333740234375, "completions/min_length": 811.0, "completions/min_terminated_length": 811.0, "epoch": 0.10583446404341927, "frac_reward_zero_std": 0.0, "grad_norm": 0.42037126421928406, "kl": 0.0, "learning_rate": 4.5496894409937887e-07, "loss": 0.0139, "num_tokens": 10761491.0, "reward": 1.1666667461395264, "reward_std": 0.2588964104652405, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.287096232175827, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 312 }, { "completion_length": 2531.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4342.0, "completions/max_terminated_length": 4342.0, "completions/mean_length": 2531.75, "completions/mean_terminated_length": 2531.75, "completions/min_length": 874.0, "completions/min_terminated_length": 874.0, "epoch": 0.10617367706919946, "frac_reward_zero_std": 1.0, "grad_norm": 1.324536924585118e-07, "kl": 0.0, "learning_rate": 4.547964113181504e-07, "loss": 0.0, "num_tokens": 10803350.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 313 }, { "completion_length": 3405.416748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6562.0, "completions/mean_length": 3954.5, "completions/mean_terminated_length": 3715.0, "completions/min_length": 1611.0, "completions/min_terminated_length": 1611.0, "epoch": 0.10651289009497965, "frac_reward_zero_std": 0.0, "grad_norm": 0.647424578666687, "kl": NaN, "learning_rate": 4.5462387853692197e-07, "loss": -0.0113, "num_tokens": 10858477.0, "reward": 0.6791666746139526, "reward_std": 0.2734726071357727, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 314 }, { "completion_length": 2428.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5417.0, "completions/max_terminated_length": 5417.0, "completions/mean_length": 2428.166748046875, "completions/mean_terminated_length": 2428.166748046875, "completions/min_length": 1073.0, "completions/min_terminated_length": 1073.0, "epoch": 0.10685210312075984, "frac_reward_zero_std": 0.0, "grad_norm": 0.5885283350944519, "kl": 0.0, "learning_rate": 4.544513457556935e-07, "loss": 0.023, "num_tokens": 10903773.0, "reward": 1.1166667938232422, "reward_std": 0.24571877717971802, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.27579089999198914, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 315 }, { "completion_length": 3376.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6365.0, "completions/max_terminated_length": 6365.0, "completions/mean_length": 3376.166748046875, "completions/mean_terminated_length": 3376.166748046875, "completions/min_length": 1547.0, "completions/min_terminated_length": 1547.0, "epoch": 0.10719131614654002, "frac_reward_zero_std": 0.5, "grad_norm": 0.15940889716148376, "kl": 0.0, "learning_rate": 4.5427881297446513e-07, "loss": -0.0027, "num_tokens": 10957691.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 316 }, { "completion_length": 1784.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4751.0, "completions/max_terminated_length": 4751.0, "completions/mean_length": 1784.8333740234375, "completions/mean_terminated_length": 1784.8333740234375, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.10753052917232021, "frac_reward_zero_std": 0.5, "grad_norm": 0.41797852516174316, "kl": 0.0, "learning_rate": 4.5410628019323674e-07, "loss": 0.0045, "num_tokens": 10988949.0, "reward": 1.0333335399627686, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 317 }, { "completion_length": 1927.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 3727.0, "completions/max_terminated_length": 3727.0, "completions/mean_length": 1927.25, "completions/mean_terminated_length": 1927.25, "completions/min_length": 823.0, "completions/min_terminated_length": 823.0, "epoch": 0.1078697421981004, "frac_reward_zero_std": 0.5, "grad_norm": 0.07680307328701019, "kl": 0.0, "learning_rate": 4.5393374741200824e-07, "loss": -0.0008, "num_tokens": 11024550.0, "reward": 1.1375000476837158, "reward_std": 0.030618613585829735, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.07723929733037949, "step": 318 }, { "completion_length": 954.4166717529297, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5713.0, "completions/mean_length": 3699.83349609375, "completions/mean_terminated_length": 1636.1429443359375, "completions/min_length": 681.0, "completions/min_terminated_length": 681.0, "epoch": 0.10820895522388059, "frac_reward_zero_std": 0.0, "grad_norm": 0.2716827988624573, "kl": NaN, "learning_rate": 4.5376121463077984e-07, "loss": -0.0151, "num_tokens": 11048717.0, "reward": 0.6833333373069763, "reward_std": 0.10206204652786255, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.133143812417984, "step": 319 }, { "completion_length": 2118.5834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 6177.0, "completions/max_terminated_length": 6177.0, "completions/mean_length": 2118.58349609375, "completions/mean_terminated_length": 2118.58349609375, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "epoch": 0.10854816824966079, "frac_reward_zero_std": 0.0, "grad_norm": 0.16588005423545837, "kl": 0.0, "learning_rate": 4.535886818495514e-07, "loss": 0.0046, "num_tokens": 11085858.0, "reward": 1.1666667461395264, "reward_std": 0.09559707343578339, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 320 }, { "completion_length": 3266.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4822.0, "completions/max_terminated_length": 4822.0, "completions/mean_length": 3266.5, "completions/mean_terminated_length": 3266.5, "completions/min_length": 2462.0, "completions/min_terminated_length": 2462.0, "epoch": 0.10888738127544098, "frac_reward_zero_std": 0.5, "grad_norm": 0.137014701962471, "kl": 0.0, "learning_rate": 4.53416149068323e-07, "loss": 0.0032, "num_tokens": 11136462.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 321 }, { "completion_length": 2219.0000610351562, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5317.0, "completions/mean_length": 2768.08349609375, "completions/mean_terminated_length": 2420.727294921875, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "epoch": 0.10922659430122117, "frac_reward_zero_std": 0.5, "grad_norm": 1.0347485542297363, "kl": NaN, "learning_rate": 4.532436162870945e-07, "loss": -0.049, "num_tokens": 11172702.0, "reward": 0.9375001192092896, "reward_std": 0.28885549306869507, "rewards/correctness_reward_func/mean": 0.6499999761581421, "rewards/correctness_reward_func/std": 0.40113475918769836, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 322 }, { "completion_length": 1779.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3227.0, "completions/max_terminated_length": 3227.0, "completions/mean_length": 1779.75, "completions/mean_terminated_length": 1779.75, "completions/min_length": 648.0, "completions/min_terminated_length": 648.0, "epoch": 0.10956580732700136, "frac_reward_zero_std": 0.5, "grad_norm": 0.34098246693611145, "kl": 0.0, "learning_rate": 4.530710835058661e-07, "loss": -0.0022, "num_tokens": 11204241.0, "reward": 0.36666667461395264, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.06666667014360428, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 323 }, { "completion_length": 2422.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4466.0, "completions/max_terminated_length": 4466.0, "completions/mean_length": 2422.75, "completions/mean_terminated_length": 2422.75, "completions/min_length": 853.0, "completions/min_terminated_length": 853.0, "epoch": 0.10990502035278155, "frac_reward_zero_std": 0.5, "grad_norm": 0.12155548483133316, "kl": 0.0, "learning_rate": 4.5289855072463766e-07, "loss": 0.0057, "num_tokens": 11242524.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.45000001788139343, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 324 }, { "completion_length": 1740.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2459.0, "completions/max_terminated_length": 2459.0, "completions/mean_length": 1740.25, "completions/mean_terminated_length": 1740.25, "completions/min_length": 1336.0, "completions/min_terminated_length": 1336.0, "epoch": 0.11024423337856173, "frac_reward_zero_std": 1.0, "grad_norm": 1.1897661522652925e-07, "kl": 0.0, "learning_rate": 4.527260179434092e-07, "loss": 0.0, "num_tokens": 11277657.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 325 }, { "completion_length": 1775.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 5753.0, "completions/max_terminated_length": 5753.0, "completions/mean_length": 1775.8333740234375, "completions/mean_terminated_length": 1775.8333740234375, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 0.11058344640434192, "frac_reward_zero_std": 0.0, "grad_norm": 0.16949614882469177, "kl": 0.0, "learning_rate": 4.5255348516218076e-07, "loss": 0.0069, "num_tokens": 11312557.0, "reward": 1.1875, "reward_std": 0.09653984010219574, "rewards/correctness_reward_func/mean": 0.9000000357627869, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 326 }, { "completion_length": 930.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2166.0, "completions/max_terminated_length": 2166.0, "completions/mean_length": 930.0833740234375, "completions/mean_terminated_length": 930.0833740234375, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 0.11092265943012211, "frac_reward_zero_std": 0.0, "grad_norm": 0.2900066673755646, "kl": 0.0, "learning_rate": 4.5238095238095237e-07, "loss": -0.0033, "num_tokens": 11334668.0, "reward": 1.0375001430511475, "reward_std": 0.23474279046058655, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.24308621883392334, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 327 }, { "completion_length": 2132.3333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4965.0, "completions/mean_length": 2681.416748046875, "completions/mean_terminated_length": 2326.181884765625, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "epoch": 0.1112618724559023, "frac_reward_zero_std": 0.5, "grad_norm": 0.5346319079399109, "kl": NaN, "learning_rate": 4.522084195997239e-07, "loss": -0.0373, "num_tokens": 11372550.0, "reward": 1.008333444595337, "reward_std": 0.22453658282756805, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 328 }, { "completion_length": 1133.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 3432.0, "completions/max_terminated_length": 3432.0, "completions/mean_length": 1133.25, "completions/mean_terminated_length": 1133.25, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "epoch": 0.1116010854816825, "frac_reward_zero_std": 1.0, "grad_norm": 2.394129978711135e-07, "kl": 0.0, "learning_rate": 4.520358868184955e-07, "loss": 0.0, "num_tokens": 11402877.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 329 }, { "completion_length": 3503.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4891.0, "completions/max_terminated_length": 4891.0, "completions/mean_length": 3503.916748046875, "completions/mean_terminated_length": 3503.916748046875, "completions/min_length": 2531.0, "completions/min_terminated_length": 2531.0, "epoch": 0.11194029850746269, "frac_reward_zero_std": 0.5, "grad_norm": 0.5139081478118896, "kl": 0.0, "learning_rate": 4.5186335403726703e-07, "loss": 0.0014, "num_tokens": 11455586.0, "reward": 1.0333335399627686, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 330 }, { "completion_length": 752.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1077.0, "completions/max_terminated_length": 1077.0, "completions/mean_length": 752.1666870117188, "completions/mean_terminated_length": 752.1666870117188, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "epoch": 0.11227951153324288, "frac_reward_zero_std": 0.5, "grad_norm": 0.2810387909412384, "kl": 0.0, "learning_rate": 4.5169082125603863e-07, "loss": -0.0016, "num_tokens": 11472094.0, "reward": 1.0208334922790527, "reward_std": 0.19391795992851257, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 331 }, { "completion_length": 3109.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6303.0, "completions/max_terminated_length": 6303.0, "completions/mean_length": 3109.666748046875, "completions/mean_terminated_length": 3109.666748046875, "completions/min_length": 1302.0, "completions/min_terminated_length": 1302.0, "epoch": 0.11261872455902307, "frac_reward_zero_std": 0.5, "grad_norm": 0.49435505270957947, "kl": 0.0, "learning_rate": 4.5151828847481024e-07, "loss": -0.0082, "num_tokens": 11521260.0, "reward": 1.0333335399627686, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 332 }, { "completion_length": 2569.25, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5876.0, "completions/mean_length": 3118.33349609375, "completions/mean_terminated_length": 2802.818359375, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "epoch": 0.11295793758480326, "frac_reward_zero_std": 0.5, "grad_norm": 1.2554776668548584, "kl": NaN, "learning_rate": 4.5134575569358174e-07, "loss": -0.0563, "num_tokens": 11561421.0, "reward": 0.5125000476837158, "reward_std": 0.3184925317764282, "rewards/correctness_reward_func/mean": 0.25, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 333 }, { "completion_length": 1469.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2300.0, "completions/max_terminated_length": 2300.0, "completions/mean_length": 1469.916748046875, "completions/mean_terminated_length": 1469.916748046875, "completions/min_length": 763.0, "completions/min_terminated_length": 763.0, "epoch": 0.11329715061058344, "frac_reward_zero_std": 0.5, "grad_norm": 0.08878318965435028, "kl": 0.0, "learning_rate": 4.5117322291235335e-07, "loss": 0.0007, "num_tokens": 11590916.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 334 }, { "completion_length": 2060.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3121.0, "completions/max_terminated_length": 3121.0, "completions/mean_length": 2060.33349609375, "completions/mean_terminated_length": 2060.33349609375, "completions/min_length": 733.0, "completions/min_terminated_length": 733.0, "epoch": 0.11363636363636363, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.510006901311249e-07, "loss": 0.0, "num_tokens": 11627238.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 335 }, { "completion_length": 2561.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 3956.0, "completions/max_terminated_length": 3956.0, "completions/mean_length": 2561.0, "completions/mean_terminated_length": 2561.0, "completions/min_length": 1604.0, "completions/min_terminated_length": 1604.0, "epoch": 0.11397557666214382, "frac_reward_zero_std": 0.0, "grad_norm": 1.5772103071212769, "kl": 0.0, "learning_rate": 4.508281573498965e-07, "loss": 0.0067, "num_tokens": 11666136.0, "reward": 1.1500000953674316, "reward_std": 0.2557638883590698, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444522619247437, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 336 }, { "completion_length": 1338.5833740234375, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6187.0, "completions/mean_length": 2985.83349609375, "completions/mean_terminated_length": 1784.77783203125, "completions/min_length": 672.0, "completions/min_terminated_length": 672.0, "epoch": 0.11431478968792401, "frac_reward_zero_std": 0.5, "grad_norm": 0.14361168444156647, "kl": NaN, "learning_rate": 4.50655624568668e-07, "loss": -0.0182, "num_tokens": 11693431.0, "reward": 0.6250001192092896, "reward_std": 0.08215838670730591, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 337 }, { "completion_length": 1112.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3285.0, "completions/max_terminated_length": 3285.0, "completions/mean_length": 1112.666748046875, "completions/mean_terminated_length": 1112.666748046875, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.11465400271370421, "frac_reward_zero_std": 0.5, "grad_norm": 0.3458263874053955, "kl": 0.0, "learning_rate": 4.504830917874396e-07, "loss": -0.0033, "num_tokens": 11719575.0, "reward": 1.0375001430511475, "reward_std": 0.17446348071098328, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.24308621883392334, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 338 }, { "completion_length": 2146.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5674.0, "completions/max_terminated_length": 5674.0, "completions/mean_length": 2146.58349609375, "completions/mean_terminated_length": 2146.58349609375, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.1149932157394844, "frac_reward_zero_std": 0.0, "grad_norm": 0.5401544570922852, "kl": 0.0, "learning_rate": 4.5031055900621116e-07, "loss": 0.0189, "num_tokens": 11756044.0, "reward": 0.6708333492279053, "reward_std": 0.22469764947891235, "rewards/correctness_reward_func/mean": 0.38333332538604736, "rewards/correctness_reward_func/std": 0.4783177673816681, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 339 }, { "completion_length": 2249.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4573.0, "completions/max_terminated_length": 4573.0, "completions/mean_length": 2249.166748046875, "completions/mean_terminated_length": 2249.166748046875, "completions/min_length": 1230.0, "completions/min_terminated_length": 1230.0, "epoch": 0.11533242876526459, "frac_reward_zero_std": 0.5, "grad_norm": 0.08829537034034729, "kl": 0.0, "learning_rate": 4.501380262249827e-07, "loss": -0.0025, "num_tokens": 11795994.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 340 }, { "completion_length": 2166.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4724.0, "completions/max_terminated_length": 4724.0, "completions/mean_length": 2166.0, "completions/mean_terminated_length": 2166.0, "completions/min_length": 693.0, "completions/min_terminated_length": 693.0, "epoch": 0.11567164179104478, "frac_reward_zero_std": 0.5, "grad_norm": 0.09921236336231232, "kl": 0.0, "learning_rate": 4.4996549344375427e-07, "loss": 0.0037, "num_tokens": 11834874.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 341 }, { "completion_length": 1815.8333740234375, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5702.0, "completions/mean_length": 3463.08349609375, "completions/mean_terminated_length": 2421.111083984375, "completions/min_length": 956.0, "completions/min_terminated_length": 956.0, "epoch": 0.11601085481682497, "frac_reward_zero_std": 0.5, "grad_norm": 0.7487086653709412, "kl": NaN, "learning_rate": 4.497929606625259e-07, "loss": -0.0992, "num_tokens": 11865634.0, "reward": 0.9750000238418579, "reward_std": 0.35601967573165894, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 342 }, { "completion_length": 1490.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4809.0, "completions/max_terminated_length": 4809.0, "completions/mean_length": 1490.25, "completions/mean_terminated_length": 1490.25, "completions/min_length": 912.0, "completions/min_terminated_length": 912.0, "epoch": 0.11635006784260515, "frac_reward_zero_std": 0.5, "grad_norm": 0.07796503603458405, "kl": 0.0, "learning_rate": 4.4962042788129743e-07, "loss": 0.0001, "num_tokens": 11891383.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 343 }, { "completion_length": 1805.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3139.0, "completions/max_terminated_length": 3139.0, "completions/mean_length": 1805.3333740234375, "completions/mean_terminated_length": 1805.3333740234375, "completions/min_length": 947.0, "completions/min_terminated_length": 947.0, "epoch": 0.11668928086838534, "frac_reward_zero_std": 1.0, "grad_norm": 2.4056276970441104e-07, "kl": 0.0, "learning_rate": 4.49447895100069e-07, "loss": 0.0, "num_tokens": 11928323.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 344 }, { "completion_length": 2549.83349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 4606.0, "completions/max_terminated_length": 4606.0, "completions/mean_length": 2549.83349609375, "completions/mean_terminated_length": 2549.83349609375, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "epoch": 0.11702849389416553, "frac_reward_zero_std": 0.0, "grad_norm": 0.6675385236740112, "kl": 0.0, "learning_rate": 4.4927536231884053e-07, "loss": 0.0182, "num_tokens": 11969691.0, "reward": 1.0500000715255737, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.24308621883392334, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 345 }, { "completion_length": 3366.666748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6368.0, "completions/mean_length": 5013.9169921875, "completions/mean_terminated_length": 4488.88916015625, "completions/min_length": 1588.0, "completions/min_terminated_length": 1588.0, "epoch": 0.11736770691994572, "frac_reward_zero_std": 0.0, "grad_norm": 0.836334228515625, "kl": NaN, "learning_rate": 4.4910282953761214e-07, "loss": -0.0777, "num_tokens": 12023327.0, "reward": 0.6041667461395264, "reward_std": 0.29333966970443726, "rewards/correctness_reward_func/mean": 0.36666667461395264, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.11894422769546509, "step": 346 }, { "completion_length": 1315.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2489.0, "completions/max_terminated_length": 2489.0, "completions/mean_length": 1315.916748046875, "completions/mean_terminated_length": 1315.916748046875, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "epoch": 0.11770691994572592, "frac_reward_zero_std": 1.0, "grad_norm": 2.1746159006852395e-07, "kl": 0.0, "learning_rate": 4.4893029675638374e-07, "loss": 0.0, "num_tokens": 12052486.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 347 }, { "completion_length": 3217.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6038.0, "completions/max_terminated_length": 6038.0, "completions/mean_length": 3217.666748046875, "completions/mean_terminated_length": 3217.666748046875, "completions/min_length": 1073.0, "completions/min_terminated_length": 1073.0, "epoch": 0.11804613297150611, "frac_reward_zero_std": 0.0, "grad_norm": 0.5589861273765564, "kl": 0.0, "learning_rate": 4.4875776397515524e-07, "loss": -0.004, "num_tokens": 12104628.0, "reward": 1.0875000953674316, "reward_std": 0.2607758939266205, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 348 }, { "completion_length": 1836.5834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 5143.0, "completions/max_terminated_length": 5143.0, "completions/mean_length": 1836.5833740234375, "completions/mean_terminated_length": 1836.5833740234375, "completions/min_length": 686.0, "completions/min_terminated_length": 686.0, "epoch": 0.1183853459972863, "frac_reward_zero_std": 0.5, "grad_norm": 0.10859017074108124, "kl": 0.0, "learning_rate": 4.4858523119392685e-07, "loss": 0.0043, "num_tokens": 12135391.0, "reward": 1.0875000953674316, "reward_std": 0.030618613585829735, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 349 }, { "completion_length": 2148.0834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 3424.0, "completions/max_terminated_length": 3424.0, "completions/mean_length": 2148.08349609375, "completions/mean_terminated_length": 2148.08349609375, "completions/min_length": 928.0, "completions/min_terminated_length": 928.0, "epoch": 0.11872455902306649, "frac_reward_zero_std": 0.5, "grad_norm": 0.08867096155881882, "kl": 0.0, "learning_rate": 4.484126984126984e-07, "loss": 0.0015, "num_tokens": 12173024.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 350 }, { "completion_length": 1849.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3647.0, "completions/max_terminated_length": 3647.0, "completions/mean_length": 1849.666748046875, "completions/mean_terminated_length": 1849.666748046875, "completions/min_length": 829.0, "completions/min_terminated_length": 829.0, "epoch": 0.11906377204884667, "frac_reward_zero_std": 0.5, "grad_norm": 0.13948985934257507, "kl": 0.0, "learning_rate": 4.4824016563146996e-07, "loss": 0.0055, "num_tokens": 12207754.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 351 }, { "completion_length": 1228.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2132.0, "completions/max_terminated_length": 2132.0, "completions/mean_length": 1228.916748046875, "completions/mean_terminated_length": 1228.916748046875, "completions/min_length": 790.0, "completions/min_terminated_length": 790.0, "epoch": 0.11940298507462686, "frac_reward_zero_std": 0.5, "grad_norm": 0.3335708677768707, "kl": 0.0, "learning_rate": 4.480676328502415e-07, "loss": -0.0039, "num_tokens": 12235113.0, "reward": 1.1375000476837158, "reward_std": 0.20600365102291107, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444522619247437, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 352 }, { "completion_length": 1891.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4444.0, "completions/max_terminated_length": 4444.0, "completions/mean_length": 1891.416748046875, "completions/mean_terminated_length": 1891.416748046875, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.11974219810040705, "frac_reward_zero_std": 0.5, "grad_norm": 0.3772743046283722, "kl": 0.0, "learning_rate": 4.478951000690131e-07, "loss": -0.0017, "num_tokens": 12268598.0, "reward": 1.133333444595337, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 353 }, { "completion_length": 1879.8334350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 5891.0, "completions/max_terminated_length": 5891.0, "completions/mean_length": 1879.8333740234375, "completions/mean_terminated_length": 1879.8333740234375, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 0.12008141112618724, "frac_reward_zero_std": 0.0, "grad_norm": 0.9070640802383423, "kl": 0.0, "learning_rate": 4.4772256728778467e-07, "loss": -0.0369, "num_tokens": 12300792.0, "reward": 0.3583333492279053, "reward_std": 0.24285396933555603, "rewards/correctness_reward_func/mean": 0.0833333358168602, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 354 }, { "completion_length": 1915.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4033.0, "completions/max_terminated_length": 4033.0, "completions/mean_length": 1915.0, "completions/mean_terminated_length": 1915.0, "completions/min_length": 814.0, "completions/min_terminated_length": 814.0, "epoch": 0.12042062415196743, "frac_reward_zero_std": 0.5, "grad_norm": 0.09051363170146942, "kl": 0.0, "learning_rate": 4.475500345065562e-07, "loss": -0.0006, "num_tokens": 12334404.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 355 }, { "completion_length": 1688.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3541.0, "completions/max_terminated_length": 3541.0, "completions/mean_length": 1688.0833740234375, "completions/mean_terminated_length": 1688.0833740234375, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 0.12075983717774763, "frac_reward_zero_std": 0.5, "grad_norm": 0.0811961218714714, "kl": 0.0, "learning_rate": 4.4737750172532777e-07, "loss": 0.0016, "num_tokens": 12362599.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 356 }, { "completion_length": 1835.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3861.0, "completions/max_terminated_length": 3861.0, "completions/mean_length": 1835.166748046875, "completions/mean_terminated_length": 1835.166748046875, "completions/min_length": 697.0, "completions/min_terminated_length": 697.0, "epoch": 0.12109905020352782, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.472049689440994e-07, "loss": 0.0, "num_tokens": 12398781.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 357 }, { "completion_length": 983.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1866.0, "completions/max_terminated_length": 1866.0, "completions/mean_length": 983.75, "completions/mean_terminated_length": 983.75, "completions/min_length": 584.0, "completions/min_terminated_length": 584.0, "epoch": 0.12143826322930801, "frac_reward_zero_std": 1.0, "grad_norm": 1.9967588116287516e-07, "kl": 0.0, "learning_rate": 4.4703243616287093e-07, "loss": 0.0, "num_tokens": 12422490.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 358 }, { "completion_length": 3115.75, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6423.0, "completions/mean_length": 4213.9169921875, "completions/mean_terminated_length": 3738.900146484375, "completions/min_length": 1909.0, "completions/min_terminated_length": 1909.0, "epoch": 0.1217774762550882, "frac_reward_zero_std": 0.0, "grad_norm": 0.9891613721847534, "kl": NaN, "learning_rate": 4.468599033816425e-07, "loss": -0.0521, "num_tokens": 12475245.0, "reward": 0.783333420753479, "reward_std": 0.2532995343208313, "rewards/correctness_reward_func/mean": 0.5333333015441895, "rewards/correctness_reward_func/std": 0.47736650705337524, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 359 }, { "completion_length": 2723.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4374.0, "completions/max_terminated_length": 4374.0, "completions/mean_length": 2723.166748046875, "completions/mean_terminated_length": 2723.166748046875, "completions/min_length": 1459.0, "completions/min_terminated_length": 1459.0, "epoch": 0.12211668928086838, "frac_reward_zero_std": 1.0, "grad_norm": 1.2058346499088657e-07, "kl": 0.0, "learning_rate": 4.4668737060041404e-07, "loss": 0.0, "num_tokens": 12520649.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 360 }, { "completion_length": 1992.416748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5684.0, "completions/mean_length": 2541.5, "completions/mean_terminated_length": 2173.54541015625, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.12245590230664857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2540488541126251, "kl": NaN, "learning_rate": 4.4651483781918564e-07, "loss": -0.0095, "num_tokens": 12556066.0, "reward": 0.6791667342185974, "reward_std": 0.10357433557510376, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 361 }, { "completion_length": 2420.5834350585938, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5967.0, "completions/mean_length": 2969.666748046875, "completions/mean_terminated_length": 2640.636474609375, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 0.12279511533242876, "frac_reward_zero_std": 0.0, "grad_norm": 0.33543863892555237, "kl": NaN, "learning_rate": 4.4634230503795714e-07, "loss": -0.0596, "num_tokens": 12599795.0, "reward": 1.0958333015441895, "reward_std": 0.2968290448188782, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.28069180250167847, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 362 }, { "completion_length": 2000.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4281.0, "completions/max_terminated_length": 4281.0, "completions/mean_length": 2000.166748046875, "completions/mean_terminated_length": 2000.166748046875, "completions/min_length": 627.0, "completions/min_terminated_length": 627.0, "epoch": 0.12313432835820895, "frac_reward_zero_std": 0.5, "grad_norm": 0.47814515233039856, "kl": 0.0, "learning_rate": 4.4616977225672875e-07, "loss": 0.0103, "num_tokens": 12634171.0, "reward": 1.066666603088379, "reward_std": 0.20655910670757294, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941503047943, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 363 }, { "completion_length": 2920.33349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 6174.0, "completions/max_terminated_length": 6174.0, "completions/mean_length": 2920.33349609375, "completions/mean_terminated_length": 2920.33349609375, "completions/min_length": 636.0, "completions/min_terminated_length": 636.0, "epoch": 0.12347354138398914, "frac_reward_zero_std": 0.0, "grad_norm": 0.5874470472335815, "kl": 0.0, "learning_rate": 4.4599723947550035e-07, "loss": 0.0089, "num_tokens": 12684635.0, "reward": 1.133333444595337, "reward_std": 0.25163978338241577, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.28069180250167847, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 364 }, { "completion_length": 2071.666748046875, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5161.0, "completions/mean_length": 3169.83349609375, "completions/mean_terminated_length": 2486.0, "completions/min_length": 726.0, "completions/min_terminated_length": 726.0, "epoch": 0.12381275440976934, "frac_reward_zero_std": 0.5, "grad_norm": 0.7931068539619446, "kl": NaN, "learning_rate": 4.458247066942719e-07, "loss": -0.0785, "num_tokens": 12722335.0, "reward": 0.5166666507720947, "reward_std": 0.2840188145637512, "rewards/correctness_reward_func/mean": 0.2666666805744171, "rewards/correctness_reward_func/std": 0.393892765045166, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 365 }, { "completion_length": 544.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 544.8333740234375, "completions/mean_terminated_length": 544.8333740234375, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.12415196743554953, "frac_reward_zero_std": 0.5, "grad_norm": 0.2730981707572937, "kl": 0.0, "learning_rate": 4.4565217391304346e-07, "loss": -0.0016, "num_tokens": 12739961.0, "reward": 1.0208334922790527, "reward_std": 0.19391795992851257, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 366 }, { "completion_length": 2281.8334350585938, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5693.0, "completions/mean_length": 2830.916748046875, "completions/mean_terminated_length": 2489.272705078125, "completions/min_length": 815.0, "completions/min_terminated_length": 815.0, "epoch": 0.12449118046132972, "frac_reward_zero_std": 0.5, "grad_norm": 0.7804614305496216, "kl": NaN, "learning_rate": 4.45479641131815e-07, "loss": -0.0158, "num_tokens": 12782919.0, "reward": 0.8416666984558105, "reward_std": 0.1855172961950302, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 367 }, { "completion_length": 1517.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2935.0, "completions/max_terminated_length": 2935.0, "completions/mean_length": 1517.416748046875, "completions/mean_terminated_length": 1517.416748046875, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 0.1248303934871099, "frac_reward_zero_std": 1.0, "grad_norm": 1.2498109924763412e-07, "kl": 0.0, "learning_rate": 4.453071083505866e-07, "loss": 0.0, "num_tokens": 12811460.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 368 }, { "completion_length": 2193.5833740234375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4981.0, "completions/mean_length": 3291.75, "completions/mean_terminated_length": 2632.300048828125, "completions/min_length": 1154.0, "completions/min_terminated_length": 1154.0, "epoch": 0.1251696065128901, "frac_reward_zero_std": 0.5, "grad_norm": 0.574193000793457, "kl": NaN, "learning_rate": 4.4513457556935817e-07, "loss": -0.0544, "num_tokens": 12849537.0, "reward": 0.9166667461395264, "reward_std": 0.2840188145637512, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 369 }, { "completion_length": 2472.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5330.0, "completions/max_terminated_length": 5330.0, "completions/mean_length": 2472.666748046875, "completions/mean_terminated_length": 2472.666748046875, "completions/min_length": 976.0, "completions/min_terminated_length": 976.0, "epoch": 0.1255088195386703, "frac_reward_zero_std": 1.0, "grad_norm": 3.123369936020026e-07, "kl": 0.0, "learning_rate": 4.449620427881297e-07, "loss": 0.0, "num_tokens": 12895625.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 370 }, { "completion_length": 3849.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5591.0, "completions/max_terminated_length": 5591.0, "completions/mean_length": 3849.166748046875, "completions/mean_terminated_length": 3849.166748046875, "completions/min_length": 2576.0, "completions/min_terminated_length": 2576.0, "epoch": 0.12584803256445048, "frac_reward_zero_std": 0.5, "grad_norm": 0.506696343421936, "kl": 0.0, "learning_rate": 4.447895100069013e-07, "loss": -0.0017, "num_tokens": 12953713.0, "reward": 0.8666666746139526, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 371 }, { "completion_length": 1875.416748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5215.0, "completions/mean_length": 3522.666748046875, "completions/mean_terminated_length": 2500.5556640625, "completions/min_length": 1012.0, "completions/min_terminated_length": 1012.0, "epoch": 0.12618724559023067, "frac_reward_zero_std": 0.0, "grad_norm": 0.761985719203949, "kl": NaN, "learning_rate": 4.446169772256729e-07, "loss": -0.085, "num_tokens": 12991038.0, "reward": 0.4916667342185974, "reward_std": 0.4645467698574066, "rewards/correctness_reward_func/mean": 0.2666666805744171, "rewards/correctness_reward_func/std": 0.393892765045166, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 372 }, { "completion_length": 2674.416748046875, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6334.0, "completions/mean_length": 3772.58349609375, "completions/mean_terminated_length": 3209.300048828125, "completions/min_length": 1402.0, "completions/min_terminated_length": 1402.0, "epoch": 0.12652645861601086, "frac_reward_zero_std": 0.5, "grad_norm": 0.10343494266271591, "kl": NaN, "learning_rate": 4.444444444444444e-07, "loss": -0.0178, "num_tokens": 13036073.0, "reward": 0.25, "reward_std": 0.0774596780538559, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 373 }, { "completion_length": 2471.08349609375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4921.0, "completions/mean_length": 3569.25, "completions/mean_terminated_length": 2965.300048828125, "completions/min_length": 1494.0, "completions/min_terminated_length": 1494.0, "epoch": 0.12686567164179105, "frac_reward_zero_std": 0.0, "grad_norm": 0.7868385910987854, "kl": NaN, "learning_rate": 4.44271911663216e-07, "loss": -0.0781, "num_tokens": 13080804.0, "reward": 0.783333420753479, "reward_std": 0.4425841271877289, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 374 }, { "completion_length": 1423.1666870117188, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 4723.0, "completions/mean_length": 4717.6669921875, "completions/mean_terminated_length": 2846.33349609375, "completions/min_length": 1608.0, "completions/min_terminated_length": 1608.0, "epoch": 0.12720488466757124, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": NaN, "learning_rate": 4.4409937888198754e-07, "loss": 0.0, "num_tokens": 13109138.0, "reward": 0.6499999761581421, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 375 }, { "completion_length": 2622.75, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4907.0, "completions/mean_length": 3171.83349609375, "completions/mean_terminated_length": 2861.181884765625, "completions/min_length": 706.0, "completions/min_terminated_length": 706.0, "epoch": 0.12754409769335143, "frac_reward_zero_std": 0.0, "grad_norm": 0.6112473607063293, "kl": NaN, "learning_rate": 4.4392684610075915e-07, "loss": -0.0348, "num_tokens": 13149719.0, "reward": 0.9750000834465027, "reward_std": 0.4121825695037842, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.3357488214969635, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 376 }, { "completion_length": 2402.0001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5788.0, "completions/mean_length": 2951.08349609375, "completions/mean_terminated_length": 2620.36376953125, "completions/min_length": 1429.0, "completions/min_terminated_length": 1429.0, "epoch": 0.12788331071913162, "frac_reward_zero_std": 0.0, "grad_norm": 0.2551642656326294, "kl": NaN, "learning_rate": 4.4375431331953065e-07, "loss": -0.0368, "num_tokens": 13188329.0, "reward": 1.0750000476837158, "reward_std": 0.2761763334274292, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 377 }, { "completion_length": 2022.0000610351562, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6286.0, "completions/mean_length": 2571.08349609375, "completions/mean_terminated_length": 2205.818359375, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 0.1282225237449118, "frac_reward_zero_std": 0.0, "grad_norm": 0.7428491711616516, "kl": NaN, "learning_rate": 4.4358178053830225e-07, "loss": -0.0548, "num_tokens": 13223171.0, "reward": 0.6083333492279053, "reward_std": 0.2925342321395874, "rewards/correctness_reward_func/mean": 0.38333332538604736, "rewards/correctness_reward_func/std": 0.4783177673816681, "rewards/format_reward_func/mean": 0.2250000238418579, "rewards/format_reward_func/std": 0.10112998634576797, "step": 378 }, { "completion_length": 877.5000457763672, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4776.0, "completions/mean_length": 3073.83349609375, "completions/mean_terminated_length": 1316.25, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.128561736770692, "frac_reward_zero_std": 0.5, "grad_norm": 0.17597438395023346, "kl": NaN, "learning_rate": 4.4340924775707386e-07, "loss": -0.019, "num_tokens": 13243979.0, "reward": 0.699999988079071, "reward_std": 0.0774596780538559, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 379 }, { "completion_length": 1323.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1966.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 1323.166748046875, "completions/mean_terminated_length": 1323.166748046875, "completions/min_length": 777.0, "completions/min_terminated_length": 777.0, "epoch": 0.12890094979647218, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.432367149758454e-07, "loss": 0.0, "num_tokens": 13267591.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 380 }, { "completion_length": 1451.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3437.0, "completions/max_terminated_length": 3437.0, "completions/mean_length": 1451.8333740234375, "completions/mean_terminated_length": 1451.8333740234375, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.12924016282225237, "frac_reward_zero_std": 0.5, "grad_norm": 0.7834616899490356, "kl": 0.0, "learning_rate": 4.4306418219461696e-07, "loss": -0.0123, "num_tokens": 13297223.0, "reward": 0.8666667342185974, "reward_std": 0.2581988573074341, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.42497774958610535, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 381 }, { "completion_length": 1094.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2571.0, "completions/max_terminated_length": 2571.0, "completions/mean_length": 1094.0833740234375, "completions/mean_terminated_length": 1094.0833740234375, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.12957937584803256, "frac_reward_zero_std": 1.0, "grad_norm": 2.6127665364583663e-07, "kl": 0.0, "learning_rate": 4.428916494133885e-07, "loss": 0.0, "num_tokens": 13324314.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 382 }, { "completion_length": 2174.916748046875, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 5623.0, "completions/mean_length": 5469.4169921875, "completions/mean_terminated_length": 4349.83349609375, "completions/min_length": 3537.0, "completions/min_terminated_length": 3537.0, "epoch": 0.12991858887381275, "frac_reward_zero_std": 0.5, "grad_norm": 0.7466797232627869, "kl": NaN, "learning_rate": 4.427191166321601e-07, "loss": 0.0017, "num_tokens": 13360529.0, "reward": 0.3999999761581421, "reward_std": 0.27386125922203064, "rewards/correctness_reward_func/mean": 0.25, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 383 }, { "completion_length": 2885.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5507.0, "completions/max_terminated_length": 5507.0, "completions/mean_length": 2885.0, "completions/mean_terminated_length": 2885.0, "completions/min_length": 690.0, "completions/min_terminated_length": 690.0, "epoch": 0.13025780189959293, "frac_reward_zero_std": 1.0, "grad_norm": 9.003652934325146e-08, "kl": 0.0, "learning_rate": 4.425465838509316e-07, "loss": 0.0, "num_tokens": 13409909.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 384 }, { "completion_length": 1805.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3747.0, "completions/max_terminated_length": 3747.0, "completions/mean_length": 1805.916748046875, "completions/mean_terminated_length": 1805.916748046875, "completions/min_length": 892.0, "completions/min_terminated_length": 892.0, "epoch": 0.13059701492537312, "frac_reward_zero_std": 0.5, "grad_norm": 0.4168512523174286, "kl": 0.0, "learning_rate": 4.4237405106970323e-07, "loss": -0.0029, "num_tokens": 13443088.0, "reward": 1.0208334922790527, "reward_std": 0.19391795992851257, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 385 }, { "completion_length": 1178.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2213.0, "completions/max_terminated_length": 2213.0, "completions/mean_length": 1178.0, "completions/mean_terminated_length": 1178.0, "completions/min_length": 848.0, "completions/min_terminated_length": 848.0, "epoch": 0.1309362279511533, "frac_reward_zero_std": 0.0, "grad_norm": 0.14581476151943207, "kl": 0.0, "learning_rate": 4.422015182884748e-07, "loss": -0.0007, "num_tokens": 13469020.0, "reward": 1.2166666984558105, "reward_std": 0.10641199350357056, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 386 }, { "completion_length": 1169.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1971.0, "completions/max_terminated_length": 1971.0, "completions/mean_length": 1169.8333740234375, "completions/mean_terminated_length": 1169.8333740234375, "completions/min_length": 731.0, "completions/min_terminated_length": 731.0, "epoch": 0.13127544097693353, "frac_reward_zero_std": 0.5, "grad_norm": 0.0867735743522644, "kl": 0.0, "learning_rate": 4.420289855072464e-07, "loss": -0.0019, "num_tokens": 13495550.0, "reward": 0.7666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 387 }, { "completion_length": 1863.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 5399.0, "completions/max_terminated_length": 5399.0, "completions/mean_length": 1863.166748046875, "completions/mean_terminated_length": 1863.166748046875, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 0.13161465400271372, "frac_reward_zero_std": 0.5, "grad_norm": 0.49067234992980957, "kl": 0.0, "learning_rate": 4.418564527260179e-07, "loss": 0.0198, "num_tokens": 13532824.0, "reward": 1.2000000476837158, "reward_std": 0.20000001788139343, "rewards/correctness_reward_func/mean": 0.9000000357627869, "rewards/correctness_reward_func/std": 0.28919950127601624, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 388 }, { "completion_length": 1987.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4437.0, "completions/max_terminated_length": 4437.0, "completions/mean_length": 1987.416748046875, "completions/mean_terminated_length": 1987.416748046875, "completions/min_length": 896.0, "completions/min_terminated_length": 896.0, "epoch": 0.1319538670284939, "frac_reward_zero_std": 0.0, "grad_norm": 0.6929435133934021, "kl": 0.0, "learning_rate": 4.416839199447895e-07, "loss": -0.0, "num_tokens": 13566951.0, "reward": 0.7750000953674316, "reward_std": 0.2602938413619995, "rewards/correctness_reward_func/mean": 0.4999999701976776, "rewards/correctness_reward_func/std": 0.4472135901451111, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 389 }, { "completion_length": 1799.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2813.0, "completions/max_terminated_length": 2813.0, "completions/mean_length": 1799.5, "completions/mean_terminated_length": 1799.5, "completions/min_length": 1107.0, "completions/min_terminated_length": 1107.0, "epoch": 0.1322930800542741, "frac_reward_zero_std": 0.5, "grad_norm": 0.12015525251626968, "kl": 0.0, "learning_rate": 4.4151138716356105e-07, "loss": 0.0012, "num_tokens": 13601895.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.09045340120792389, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 390 }, { "completion_length": 1699.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4197.0, "completions/max_terminated_length": 4197.0, "completions/mean_length": 1699.416748046875, "completions/mean_terminated_length": 1699.416748046875, "completions/min_length": 847.0, "completions/min_terminated_length": 847.0, "epoch": 0.13263229308005428, "frac_reward_zero_std": 0.5, "grad_norm": 0.39857861399650574, "kl": 0.0, "learning_rate": 4.4133885438233265e-07, "loss": -0.0103, "num_tokens": 13631660.0, "reward": 1.1166667938232422, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 391 }, { "completion_length": 1202.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2096.0, "completions/max_terminated_length": 2096.0, "completions/mean_length": 1202.5833740234375, "completions/mean_terminated_length": 1202.5833740234375, "completions/min_length": 643.0, "completions/min_terminated_length": 643.0, "epoch": 0.13297150610583447, "frac_reward_zero_std": 0.0, "grad_norm": 0.3004428446292877, "kl": 0.0, "learning_rate": 4.4116632160110415e-07, "loss": 0.0008, "num_tokens": 13657545.0, "reward": 1.0375001430511475, "reward_std": 0.23474279046058655, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.24308621883392334, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 392 }, { "completion_length": 3542.166748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5556.0, "completions/mean_length": 4091.25, "completions/mean_terminated_length": 3864.181884765625, "completions/min_length": 2148.0, "completions/min_terminated_length": 2148.0, "epoch": 0.13331071913161466, "frac_reward_zero_std": 0.0, "grad_norm": 0.1583491563796997, "kl": NaN, "learning_rate": 4.4099378881987576e-07, "loss": -0.0071, "num_tokens": 13712039.0, "reward": 0.21250002086162567, "reward_std": 0.06934845447540283, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.21250002086162567, "rewards/format_reward_func/std": 0.07723929733037949, "step": 393 }, { "completion_length": 1208.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2981.0, "completions/max_terminated_length": 2981.0, "completions/mean_length": 1208.5, "completions/mean_terminated_length": 1208.5, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 0.13364993215739485, "frac_reward_zero_std": 0.5, "grad_norm": 0.10130270570516586, "kl": 0.0, "learning_rate": 4.4082125603864736e-07, "loss": 0.0026, "num_tokens": 13737101.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 394 }, { "completion_length": 2585.0001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5495.0, "completions/mean_length": 3134.08349609375, "completions/mean_terminated_length": 2820.0, "completions/min_length": 1315.0, "completions/min_terminated_length": 1315.0, "epoch": 0.13398914518317503, "frac_reward_zero_std": 0.0, "grad_norm": 0.608137845993042, "kl": NaN, "learning_rate": 4.4064872325741886e-07, "loss": -0.0506, "num_tokens": 13782419.0, "reward": 1.058333396911621, "reward_std": 0.2877541482448578, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 395 }, { "completion_length": 3074.166748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5930.0, "completions/mean_length": 3623.25, "completions/mean_terminated_length": 3353.636474609375, "completions/min_length": 1629.0, "completions/min_terminated_length": 1629.0, "epoch": 0.13432835820895522, "frac_reward_zero_std": 0.5, "grad_norm": 0.5104953646659851, "kl": NaN, "learning_rate": 4.4047619047619047e-07, "loss": -0.0287, "num_tokens": 13833343.0, "reward": 0.9416667819023132, "reward_std": 0.24983328580856323, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 396 }, { "completion_length": 1857.0833740234375, "completions/clipped_ratio": 0.5833333333333333, "completions/max_length": 6589.0, "completions/max_terminated_length": 5661.0, "completions/mean_length": 5700.6669921875, "completions/mean_terminated_length": 4457.0, "completions/min_length": 3203.0, "completions/min_terminated_length": 3203.0, "epoch": 0.1346675712347354, "frac_reward_zero_std": 0.0, "grad_norm": 0.1551879197359085, "kl": NaN, "learning_rate": 4.40303657694962e-07, "loss": -0.0141, "num_tokens": 13864922.0, "reward": 0.13750001788139343, "reward_std": 0.09185586869716644, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.13750000298023224, "rewards/format_reward_func/std": 0.14943073689937592, "step": 397 }, { "completion_length": 3534.75, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6351.0, "completions/mean_length": 4632.9169921875, "completions/mean_terminated_length": 4241.7001953125, "completions/min_length": 1820.0, "completions/min_terminated_length": 1820.0, "epoch": 0.1350067842605156, "frac_reward_zero_std": 0.5, "grad_norm": 0.24700386822223663, "kl": NaN, "learning_rate": 4.4013112491373363e-07, "loss": -0.0244, "num_tokens": 13922897.0, "reward": 0.6500000953674316, "reward_std": 0.0774596780538559, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 398 }, { "completion_length": 997.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1720.0, "completions/max_terminated_length": 1720.0, "completions/mean_length": 997.1666870117188, "completions/mean_terminated_length": 997.1666870117188, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.1353459972862958, "frac_reward_zero_std": 0.5, "grad_norm": 0.05267763510346413, "kl": 0.0, "learning_rate": 4.3995859213250513e-07, "loss": 0.0, "num_tokens": 13944649.0, "reward": 1.1875, "reward_std": 0.030618607997894287, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 399 }, { "completion_length": 2047.666748046875, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6290.0, "completions/mean_length": 4244.0, "completions/mean_terminated_length": 3071.5, "completions/min_length": 1685.0, "completions/min_terminated_length": 1685.0, "epoch": 0.13568521031207598, "frac_reward_zero_std": 0.5, "grad_norm": 0.15240563452243805, "kl": NaN, "learning_rate": 4.3978605935127673e-07, "loss": -0.0224, "num_tokens": 13983681.0, "reward": 0.6250001192092896, "reward_std": 0.06708204001188278, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.11965861916542053, "step": 400 }, { "completion_length": 1873.4166870117188, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6427.0, "completions/mean_length": 2422.5, "completions/mean_terminated_length": 2043.727294921875, "completions/min_length": 580.0, "completions/min_terminated_length": 580.0, "epoch": 0.13602442333785617, "frac_reward_zero_std": 0.5, "grad_norm": 0.0715797170996666, "kl": NaN, "learning_rate": 4.396135265700483e-07, "loss": -0.0117, "num_tokens": 14017424.0, "reward": 0.7749999761581421, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 401 }, { "completion_length": 2485.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3410.0, "completions/max_terminated_length": 3410.0, "completions/mean_length": 2485.58349609375, "completions/mean_terminated_length": 2485.58349609375, "completions/min_length": 1527.0, "completions/min_terminated_length": 1527.0, "epoch": 0.13636363636363635, "frac_reward_zero_std": 1.0, "grad_norm": 2.7927711698794155e-07, "kl": 0.0, "learning_rate": 4.394409937888199e-07, "loss": 0.0, "num_tokens": 14057679.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 402 }, { "completion_length": 979.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1800.0, "completions/max_terminated_length": 1800.0, "completions/mean_length": 979.1666870117188, "completions/mean_terminated_length": 979.1666870117188, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 0.13670284938941654, "frac_reward_zero_std": 0.0, "grad_norm": 0.12159288674592972, "kl": 0.0, "learning_rate": 4.392684610075914e-07, "loss": 0.0013, "num_tokens": 14083013.0, "reward": 1.254166603088379, "reward_std": 0.08225837349891663, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 403 }, { "completion_length": 784.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1312.0, "completions/max_terminated_length": 1312.0, "completions/mean_length": 784.6666870117188, "completions/mean_terminated_length": 784.6666870117188, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 0.13704206241519673, "frac_reward_zero_std": 0.5, "grad_norm": 0.06961729377508163, "kl": 0.0, "learning_rate": 4.39095928226363e-07, "loss": -0.0009, "num_tokens": 14105899.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 404 }, { "completion_length": 1378.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2759.0, "completions/max_terminated_length": 2759.0, "completions/mean_length": 1378.8333740234375, "completions/mean_terminated_length": 1378.8333740234375, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 0.13738127544097695, "frac_reward_zero_std": 0.5, "grad_norm": 0.12275088578462601, "kl": 0.0, "learning_rate": 4.3892339544513455e-07, "loss": -0.0004, "num_tokens": 14135267.0, "reward": 0.7333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 405 }, { "completion_length": 1384.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2235.0, "completions/max_terminated_length": 2235.0, "completions/mean_length": 1384.666748046875, "completions/mean_terminated_length": 1384.666748046875, "completions/min_length": 715.0, "completions/min_terminated_length": 715.0, "epoch": 0.13772048846675713, "frac_reward_zero_std": 1.0, "grad_norm": 1.1979372516179865e-07, "kl": 0.0, "learning_rate": 4.387508626639061e-07, "loss": 0.0, "num_tokens": 14159677.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 406 }, { "completion_length": 1979.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5850.0, "completions/max_terminated_length": 5850.0, "completions/mean_length": 1979.0833740234375, "completions/mean_terminated_length": 1979.0833740234375, "completions/min_length": 654.0, "completions/min_terminated_length": 654.0, "epoch": 0.13805970149253732, "frac_reward_zero_std": 0.5, "grad_norm": 0.5809157490730286, "kl": 0.0, "learning_rate": 4.3857832988267766e-07, "loss": 0.0523, "num_tokens": 14196080.0, "reward": 0.9666668176651001, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 407 }, { "completion_length": 2523.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4132.0, "completions/max_terminated_length": 4132.0, "completions/mean_length": 2523.916748046875, "completions/mean_terminated_length": 2523.916748046875, "completions/min_length": 1115.0, "completions/min_terminated_length": 1115.0, "epoch": 0.1383989145183175, "frac_reward_zero_std": 0.5, "grad_norm": 0.13894988596439362, "kl": 0.0, "learning_rate": 4.3840579710144926e-07, "loss": -0.0037, "num_tokens": 14239801.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 408 }, { "completion_length": 1763.7500457763672, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6448.0, "completions/mean_length": 3411.0, "completions/mean_terminated_length": 2351.666748046875, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.1387381275440977, "frac_reward_zero_std": 0.0, "grad_norm": 0.8226115107536316, "kl": NaN, "learning_rate": 4.3823326432022087e-07, "loss": -0.1241, "num_tokens": 14273902.0, "reward": 0.7458333969116211, "reward_std": 0.4951653480529785, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.21250002086162567, "rewards/format_reward_func/std": 0.13505050539970398, "step": 409 }, { "completion_length": 1476.4166870117188, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5934.0, "completions/mean_length": 3123.666748046875, "completions/mean_terminated_length": 1968.5555419921875, "completions/min_length": 708.0, "completions/min_terminated_length": 708.0, "epoch": 0.1390773405698779, "frac_reward_zero_std": 0.5, "grad_norm": 1.1162936687469482, "kl": NaN, "learning_rate": 4.3806073153899237e-07, "loss": -0.082, "num_tokens": 14304171.0, "reward": 0.7583334445953369, "reward_std": 0.27095508575439453, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 410 }, { "completion_length": 3505.5833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6032.0, "completions/mean_length": 4054.666748046875, "completions/mean_terminated_length": 3824.27294921875, "completions/min_length": 1969.0, "completions/min_terminated_length": 1969.0, "epoch": 0.13941655359565808, "frac_reward_zero_std": 0.5, "grad_norm": 0.07667145878076553, "kl": NaN, "learning_rate": 4.3788819875776397e-07, "loss": -0.015, "num_tokens": 14359846.0, "reward": 0.6750000715255737, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 411 }, { "completion_length": 1065.6667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2098.0, "completions/max_terminated_length": 2098.0, "completions/mean_length": 1065.666748046875, "completions/mean_terminated_length": 1065.666748046875, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 0.13975576662143827, "frac_reward_zero_std": 0.0, "grad_norm": 0.403994083404541, "kl": 0.0, "learning_rate": 4.377156659765355e-07, "loss": 0.0055, "num_tokens": 14381556.0, "reward": 0.9874999523162842, "reward_std": 0.2497076690196991, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.43064433336257935, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 412 }, { "completion_length": 1031.8333740234375, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6426.0, "completions/mean_length": 3777.25, "completions/mean_terminated_length": 1768.857177734375, "completions/min_length": 713.0, "completions/min_terminated_length": 713.0, "epoch": 0.14009497964721845, "frac_reward_zero_std": 0.0, "grad_norm": 0.6786950826644897, "kl": NaN, "learning_rate": 4.3754313319530713e-07, "loss": -0.0741, "num_tokens": 14406148.0, "reward": 0.7291666865348816, "reward_std": 0.25515520572662354, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.16250000894069672, "rewards/format_reward_func/std": 0.14943073689937592, "step": 413 }, { "completion_length": 2563.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5917.0, "completions/max_terminated_length": 5917.0, "completions/mean_length": 2563.166748046875, "completions/mean_terminated_length": 2563.166748046875, "completions/min_length": 726.0, "completions/min_terminated_length": 726.0, "epoch": 0.14043419267299864, "frac_reward_zero_std": 0.5, "grad_norm": 0.6143797039985657, "kl": 0.0, "learning_rate": 4.3737060041407863e-07, "loss": 0.0111, "num_tokens": 14447670.0, "reward": 0.9541667699813843, "reward_std": 0.22716552019119263, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 414 }, { "completion_length": 909.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2567.0, "completions/max_terminated_length": 2567.0, "completions/mean_length": 909.0833740234375, "completions/mean_terminated_length": 909.0833740234375, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 0.14077340569877883, "frac_reward_zero_std": 1.0, "grad_norm": 2.1539605654652405e-07, "kl": 0.0, "learning_rate": 4.3719806763285024e-07, "loss": 0.0, "num_tokens": 14472895.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 415 }, { "completion_length": 2452.0834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 5666.0, "completions/max_terminated_length": 5666.0, "completions/mean_length": 2452.08349609375, "completions/mean_terminated_length": 2452.08349609375, "completions/min_length": 695.0, "completions/min_terminated_length": 695.0, "epoch": 0.14111261872455902, "frac_reward_zero_std": 0.0, "grad_norm": 0.4528708755970001, "kl": 0.0, "learning_rate": 4.370255348516218e-07, "loss": -0.0006, "num_tokens": 14513966.0, "reward": 0.9708334803581238, "reward_std": 0.2679903507232666, "rewards/correctness_reward_func/mean": 0.6833333373069763, "rewards/correctness_reward_func/std": 0.32427072525024414, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 416 }, { "completion_length": 695.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1177.0, "completions/max_terminated_length": 1177.0, "completions/mean_length": 695.9166870117188, "completions/mean_terminated_length": 695.9166870117188, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 0.1414518317503392, "frac_reward_zero_std": 0.5, "grad_norm": 0.090449757874012, "kl": 0.0, "learning_rate": 4.3685300207039334e-07, "loss": 0.0007, "num_tokens": 14531941.0, "reward": 1.2333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 417 }, { "completion_length": 2134.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 5264.0, "completions/max_terminated_length": 5264.0, "completions/mean_length": 2134.666748046875, "completions/mean_terminated_length": 2134.666748046875, "completions/min_length": 599.0, "completions/min_terminated_length": 599.0, "epoch": 0.1417910447761194, "frac_reward_zero_std": 0.5, "grad_norm": 0.5045371651649475, "kl": 0.0, "learning_rate": 4.366804692891649e-07, "loss": 0.0213, "num_tokens": 14567709.0, "reward": 0.9000000953674316, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.6000000238418579, "rewards/correctness_reward_func/std": 0.36181360483169556, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 418 }, { "completion_length": 2361.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4302.0, "completions/max_terminated_length": 4302.0, "completions/mean_length": 2361.166748046875, "completions/mean_terminated_length": 2361.166748046875, "completions/min_length": 1277.0, "completions/min_terminated_length": 1277.0, "epoch": 0.14213025780189958, "frac_reward_zero_std": 0.5, "grad_norm": 0.11170516163110733, "kl": 0.0, "learning_rate": 4.365079365079365e-07, "loss": 0.0027, "num_tokens": 14606201.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 419 }, { "completion_length": 1313.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2266.0, "completions/max_terminated_length": 2266.0, "completions/mean_length": 1313.75, "completions/mean_terminated_length": 1313.75, "completions/min_length": 760.0, "completions/min_terminated_length": 760.0, "epoch": 0.14246947082767977, "frac_reward_zero_std": 1.0, "grad_norm": 2.5472624542999256e-07, "kl": 0.0, "learning_rate": 4.3633540372670805e-07, "loss": 0.0, "num_tokens": 14639696.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 420 }, { "completion_length": 3634.750244140625, "completions/clipped_ratio": 0.0, "completions/max_length": 5848.0, "completions/max_terminated_length": 5848.0, "completions/mean_length": 3634.75, "completions/mean_terminated_length": 3634.75, "completions/min_length": 1178.0, "completions/min_terminated_length": 1178.0, "epoch": 0.14280868385345996, "frac_reward_zero_std": 0.0, "grad_norm": 1.2318860292434692, "kl": 0.0, "learning_rate": 4.361628709454796e-07, "loss": -0.0021, "num_tokens": 14693759.0, "reward": 0.7583333849906921, "reward_std": 0.40680140256881714, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 421 }, { "completion_length": 2677.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 6421.0, "completions/max_terminated_length": 6421.0, "completions/mean_length": 2677.75, "completions/mean_terminated_length": 2677.75, "completions/min_length": 1236.0, "completions/min_terminated_length": 1236.0, "epoch": 0.14314789687924015, "frac_reward_zero_std": 0.5, "grad_norm": 0.6643301844596863, "kl": 0.0, "learning_rate": 4.3599033816425116e-07, "loss": -0.0141, "num_tokens": 14740298.0, "reward": 0.9000000953674316, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.6000000238418579, "rewards/correctness_reward_func/std": 0.36181360483169556, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 422 }, { "completion_length": 1688.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2633.0, "completions/max_terminated_length": 2633.0, "completions/mean_length": 1688.0, "completions/mean_terminated_length": 1688.0, "completions/min_length": 949.0, "completions/min_terminated_length": 949.0, "epoch": 0.14348710990502037, "frac_reward_zero_std": 1.0, "grad_norm": 2.688114761895122e-07, "kl": 0.0, "learning_rate": 4.3581780538302277e-07, "loss": 0.0, "num_tokens": 14770574.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 423 }, { "completion_length": 1518.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2738.0, "completions/max_terminated_length": 2738.0, "completions/mean_length": 1518.0, "completions/mean_terminated_length": 1518.0, "completions/min_length": 905.0, "completions/min_terminated_length": 905.0, "epoch": 0.14382632293080055, "frac_reward_zero_std": 0.5, "grad_norm": 0.07617669552564621, "kl": 0.0, "learning_rate": 4.356452726017943e-07, "loss": -0.0, "num_tokens": 14802632.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 424 }, { "completion_length": 2456.166748046875, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 6572.0, "completions/mean_length": 5750.6669921875, "completions/mean_terminated_length": 4912.33349609375, "completions/min_length": 1737.0, "completions/min_terminated_length": 1737.0, "epoch": 0.14416553595658074, "frac_reward_zero_std": 0.0, "grad_norm": 0.7163939476013184, "kl": NaN, "learning_rate": 4.3547273982056587e-07, "loss": -0.0691, "num_tokens": 14845864.0, "reward": 0.2958333492279053, "reward_std": 0.29602330923080444, "rewards/correctness_reward_func/mean": 0.13333334028720856, "rewards/correctness_reward_func/std": 0.3113996088504791, "rewards/format_reward_func/mean": 0.16250000894069672, "rewards/format_reward_func/std": 0.14943073689937592, "step": 425 }, { "completion_length": 1030.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1618.0, "completions/max_terminated_length": 1618.0, "completions/mean_length": 1030.8333740234375, "completions/mean_terminated_length": 1030.8333740234375, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "epoch": 0.14450474898236093, "frac_reward_zero_std": 0.0, "grad_norm": 0.10956176370382309, "kl": 0.0, "learning_rate": 4.353002070393375e-07, "loss": 0.0006, "num_tokens": 14864090.0, "reward": 1.183333396911621, "reward_std": 0.09246458858251572, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 426 }, { "completion_length": 919.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3516.0, "completions/max_terminated_length": 3516.0, "completions/mean_length": 919.3333740234375, "completions/mean_terminated_length": 919.3333740234375, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.14484396200814112, "frac_reward_zero_std": 0.5, "grad_norm": 0.11441392451524734, "kl": 0.0, "learning_rate": 4.3512767425810903e-07, "loss": -0.0054, "num_tokens": 14889018.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 427 }, { "completion_length": 3217.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6538.0, "completions/max_terminated_length": 6538.0, "completions/mean_length": 3217.25, "completions/mean_terminated_length": 3217.25, "completions/min_length": 1158.0, "completions/min_terminated_length": 1158.0, "epoch": 0.1451831750339213, "frac_reward_zero_std": 0.5, "grad_norm": 0.20972996950149536, "kl": 0.0, "learning_rate": 4.349551414768806e-07, "loss": -0.0038, "num_tokens": 14938941.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 428 }, { "completion_length": 2134.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3320.0, "completions/max_terminated_length": 3320.0, "completions/mean_length": 2134.916748046875, "completions/mean_terminated_length": 2134.916748046875, "completions/min_length": 1154.0, "completions/min_terminated_length": 1154.0, "epoch": 0.1455223880597015, "frac_reward_zero_std": 0.5, "grad_norm": 0.10739568620920181, "kl": 0.0, "learning_rate": 4.3478260869565214e-07, "loss": 0.0037, "num_tokens": 14972300.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 429 }, { "completion_length": 1214.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3529.0, "completions/max_terminated_length": 3529.0, "completions/mean_length": 1214.75, "completions/mean_terminated_length": 1214.75, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "epoch": 0.14586160108548168, "frac_reward_zero_std": 0.0, "grad_norm": 0.3598671853542328, "kl": 0.0, "learning_rate": 4.3461007591442374e-07, "loss": 0.0009, "num_tokens": 15001169.0, "reward": 1.129166603088379, "reward_std": 0.261197566986084, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.21250002086162567, "rewards/format_reward_func/std": 0.07723929733037949, "step": 430 }, { "completion_length": 1586.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3801.0, "completions/max_terminated_length": 3801.0, "completions/mean_length": 1586.666748046875, "completions/mean_terminated_length": 1586.666748046875, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.14620081411126187, "frac_reward_zero_std": 1.0, "grad_norm": 1.477949922445987e-07, "kl": 0.0, "learning_rate": 4.344375431331953e-07, "loss": 0.0, "num_tokens": 15033133.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 431 }, { "completion_length": 859.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1384.0, "completions/max_terminated_length": 1384.0, "completions/mean_length": 859.0, "completions/mean_terminated_length": 859.0, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.14654002713704206, "frac_reward_zero_std": 0.5, "grad_norm": 0.039414018392562866, "kl": 0.0, "learning_rate": 4.3426501035196685e-07, "loss": -0.001, "num_tokens": 15053815.0, "reward": 0.6875001788139343, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 432 }, { "completion_length": 740.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1106.0, "completions/max_terminated_length": 1106.0, "completions/mean_length": 740.6666870117188, "completions/mean_terminated_length": 740.6666870117188, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 0.14687924016282225, "frac_reward_zero_std": 0.0, "grad_norm": 0.07417774200439453, "kl": 0.0, "learning_rate": 4.340924775707384e-07, "loss": -0.0005, "num_tokens": 15074877.0, "reward": 1.2708332538604736, "reward_std": 0.07144343107938766, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 433 }, { "completion_length": 3072.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4774.0, "completions/max_terminated_length": 4774.0, "completions/mean_length": 3072.166748046875, "completions/mean_terminated_length": 3072.166748046875, "completions/min_length": 1584.0, "completions/min_terminated_length": 1584.0, "epoch": 0.14721845318860244, "frac_reward_zero_std": 1.0, "grad_norm": 2.7396959012548905e-07, "kl": 0.0, "learning_rate": 4.3391994478951e-07, "loss": 0.0, "num_tokens": 15122993.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 434 }, { "completion_length": 1638.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2871.0, "completions/max_terminated_length": 2871.0, "completions/mean_length": 1638.3333740234375, "completions/mean_terminated_length": 1638.3333740234375, "completions/min_length": 621.0, "completions/min_terminated_length": 621.0, "epoch": 0.14755766621438263, "frac_reward_zero_std": 1.0, "grad_norm": 2.3176340846475796e-07, "kl": 0.0, "learning_rate": 4.3374741200828156e-07, "loss": 0.0, "num_tokens": 15153723.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 435 }, { "completion_length": 2581.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5435.0, "completions/max_terminated_length": 5435.0, "completions/mean_length": 2581.0, "completions/mean_terminated_length": 2581.0, "completions/min_length": 1202.0, "completions/min_terminated_length": 1202.0, "epoch": 0.14789687924016282, "frac_reward_zero_std": 0.5, "grad_norm": 0.1409706026315689, "kl": 0.0, "learning_rate": 4.335748792270531e-07, "loss": -0.0058, "num_tokens": 15193863.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 436 }, { "completion_length": 3028.5833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5637.0, "completions/mean_length": 3577.666748046875, "completions/mean_terminated_length": 3303.9091796875, "completions/min_length": 1284.0, "completions/min_terminated_length": 1284.0, "epoch": 0.148236092265943, "frac_reward_zero_std": 0.0, "grad_norm": 0.1644480973482132, "kl": NaN, "learning_rate": 4.3340234644582466e-07, "loss": -0.0131, "num_tokens": 15246178.0, "reward": 0.7416666746139526, "reward_std": 0.1128769963979721, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 437 }, { "completion_length": 3253.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5697.0, "completions/max_terminated_length": 5697.0, "completions/mean_length": 3253.0, "completions/mean_terminated_length": 3253.0, "completions/min_length": 1295.0, "completions/min_terminated_length": 1295.0, "epoch": 0.1485753052917232, "frac_reward_zero_std": 0.5, "grad_norm": 0.13437378406524658, "kl": 0.0, "learning_rate": 4.3322981366459627e-07, "loss": 0.001, "num_tokens": 15299776.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 438 }, { "completion_length": 1089.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2189.0, "completions/max_terminated_length": 2189.0, "completions/mean_length": 1089.75, "completions/mean_terminated_length": 1089.75, "completions/min_length": 693.0, "completions/min_terminated_length": 693.0, "epoch": 0.14891451831750338, "frac_reward_zero_std": 0.0, "grad_norm": 0.4965176284313202, "kl": 0.0, "learning_rate": 4.3305728088336777e-07, "loss": 0.0096, "num_tokens": 15323659.0, "reward": 1.1000001430511475, "reward_std": 0.23782965540885925, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 439 }, { "completion_length": 1152.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2723.0, "completions/max_terminated_length": 2723.0, "completions/mean_length": 1152.75, "completions/mean_terminated_length": 1152.75, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.14925373134328357, "frac_reward_zero_std": 0.5, "grad_norm": 0.04821237921714783, "kl": 0.0, "learning_rate": 4.328847481021394e-07, "loss": -0.0003, "num_tokens": 15347458.0, "reward": 1.1875, "reward_std": 0.030618607997894287, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 440 }, { "completion_length": 2592.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4795.0, "completions/max_terminated_length": 4795.0, "completions/mean_length": 2592.0, "completions/mean_terminated_length": 2592.0, "completions/min_length": 818.0, "completions/min_terminated_length": 818.0, "epoch": 0.14959294436906379, "frac_reward_zero_std": 1.0, "grad_norm": 2.51977240850465e-07, "kl": 0.0, "learning_rate": 4.32712215320911e-07, "loss": 0.0, "num_tokens": 15391168.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 441 }, { "completion_length": 2777.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4143.0, "completions/max_terminated_length": 4143.0, "completions/mean_length": 2777.666748046875, "completions/mean_terminated_length": 2777.666748046875, "completions/min_length": 1074.0, "completions/min_terminated_length": 1074.0, "epoch": 0.14993215739484397, "frac_reward_zero_std": 0.5, "grad_norm": 0.47685208916664124, "kl": 0.0, "learning_rate": 4.3253968253968253e-07, "loss": 0.0155, "num_tokens": 15437178.0, "reward": 1.0208334922790527, "reward_std": 0.19391795992851257, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.2309401035308838, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 442 }, { "completion_length": 1874.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4018.0, "completions/max_terminated_length": 4018.0, "completions/mean_length": 1874.3333740234375, "completions/mean_terminated_length": 1874.3333740234375, "completions/min_length": 866.0, "completions/min_terminated_length": 866.0, "epoch": 0.15027137042062416, "frac_reward_zero_std": 0.5, "grad_norm": 0.1033775582909584, "kl": 0.0, "learning_rate": 4.323671497584541e-07, "loss": -0.0001, "num_tokens": 15471610.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 443 }, { "completion_length": 1630.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4842.0, "completions/max_terminated_length": 4842.0, "completions/mean_length": 1630.0833740234375, "completions/mean_terminated_length": 1630.0833740234375, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "epoch": 0.15061058344640435, "frac_reward_zero_std": 0.5, "grad_norm": 0.08019285649061203, "kl": 0.0, "learning_rate": 4.3219461697722564e-07, "loss": -0.0026, "num_tokens": 15504059.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 444 }, { "completion_length": 1034.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2473.0, "completions/max_terminated_length": 2473.0, "completions/mean_length": 1034.25, "completions/mean_terminated_length": 1034.25, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.15094979647218454, "frac_reward_zero_std": 1.0, "grad_norm": 1.2533315896234853e-07, "kl": 0.0, "learning_rate": 4.3202208419599725e-07, "loss": 0.0, "num_tokens": 15531314.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 445 }, { "completion_length": 1931.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4648.0, "completions/max_terminated_length": 4648.0, "completions/mean_length": 1931.166748046875, "completions/mean_terminated_length": 1931.166748046875, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 0.15128900949796473, "frac_reward_zero_std": 0.5, "grad_norm": 0.7577066421508789, "kl": 0.0, "learning_rate": 4.318495514147688e-07, "loss": 0.0376, "num_tokens": 15569032.0, "reward": 0.833333432674408, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 446 }, { "completion_length": 2384.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4492.0, "completions/max_terminated_length": 4492.0, "completions/mean_length": 2384.5, "completions/mean_terminated_length": 2384.5, "completions/min_length": 647.0, "completions/min_terminated_length": 647.0, "epoch": 0.15162822252374492, "frac_reward_zero_std": 0.5, "grad_norm": 0.2805997133255005, "kl": 0.0, "learning_rate": 4.3167701863354035e-07, "loss": -0.0, "num_tokens": 15613108.0, "reward": 1.1375000476837158, "reward_std": 0.07373939454555511, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 447 }, { "completion_length": 429.8333435058594, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 3724.33349609375, "completions/mean_terminated_length": 859.6666870117188, "completions/min_length": 591.0, "completions/min_terminated_length": 591.0, "epoch": 0.1519674355495251, "frac_reward_zero_std": 1.0, "grad_norm": 1.0236065151048024e-07, "kl": NaN, "learning_rate": 4.315044858523119e-07, "loss": 0.0, "num_tokens": 15632792.0, "reward": 0.5500000715255737, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 448 }, { "completion_length": 2166.666748046875, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5026.0, "completions/mean_length": 3264.83349609375, "completions/mean_terminated_length": 2600.0, "completions/min_length": 1176.0, "completions/min_terminated_length": 1176.0, "epoch": 0.1523066485753053, "frac_reward_zero_std": 0.5, "grad_norm": 0.06184069439768791, "kl": NaN, "learning_rate": 4.313319530710835e-07, "loss": -0.0104, "num_tokens": 15670522.0, "reward": 0.6750000715255737, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 449 }, { "completion_length": 2323.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 6337.0, "completions/max_terminated_length": 6337.0, "completions/mean_length": 2323.5, "completions/mean_terminated_length": 2323.5, "completions/min_length": 855.0, "completions/min_terminated_length": 855.0, "epoch": 0.15264586160108548, "frac_reward_zero_std": 0.5, "grad_norm": 0.8851024508476257, "kl": 0.0, "learning_rate": 4.3115942028985506e-07, "loss": 0.0646, "num_tokens": 15709432.0, "reward": 1.0166667699813843, "reward_std": 0.24832776188850403, "rewards/correctness_reward_func/mean": 0.7166666984558105, "rewards/correctness_reward_func/std": 0.3459725081920624, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 450 }, { "completion_length": 2351.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4691.0, "completions/max_terminated_length": 4691.0, "completions/mean_length": 2351.5, "completions/mean_terminated_length": 2351.5, "completions/min_length": 829.0, "completions/min_terminated_length": 829.0, "epoch": 0.15298507462686567, "frac_reward_zero_std": 1.0, "grad_norm": 2.1750710743617674e-07, "kl": 0.0, "learning_rate": 4.309868875086266e-07, "loss": 0.0, "num_tokens": 15750946.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 451 }, { "completion_length": 2530.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5296.0, "completions/max_terminated_length": 5296.0, "completions/mean_length": 2530.0, "completions/mean_terminated_length": 2530.0, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 0.15332428765264586, "frac_reward_zero_std": 0.0, "grad_norm": 0.1575542837381363, "kl": 0.0, "learning_rate": 4.3081435472739817e-07, "loss": 0.0015, "num_tokens": 15794548.0, "reward": 1.1541666984558105, "reward_std": 0.08225835859775543, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 452 }, { "completion_length": 1643.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2960.0, "completions/max_terminated_length": 2960.0, "completions/mean_length": 1643.3333740234375, "completions/mean_terminated_length": 1643.3333740234375, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 0.15366350067842605, "frac_reward_zero_std": 1.0, "grad_norm": 1.257485422456739e-07, "kl": 0.0, "learning_rate": 4.306418219461698e-07, "loss": 0.0, "num_tokens": 15827666.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 453 }, { "completion_length": 1459.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2622.0, "completions/max_terminated_length": 2622.0, "completions/mean_length": 1459.916748046875, "completions/mean_terminated_length": 1459.916748046875, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 0.15400271370420623, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.304692891649413e-07, "loss": 0.0, "num_tokens": 15854089.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 454 }, { "completion_length": 1496.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2525.0, "completions/max_terminated_length": 2525.0, "completions/mean_length": 1496.5, "completions/mean_terminated_length": 1496.5, "completions/min_length": 720.0, "completions/min_terminated_length": 720.0, "epoch": 0.15434192672998642, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.302967563837129e-07, "loss": 0.0, "num_tokens": 15882739.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 455 }, { "completion_length": 1633.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4204.0, "completions/max_terminated_length": 4204.0, "completions/mean_length": 1633.3333740234375, "completions/mean_terminated_length": 1633.3333740234375, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.1546811397557666, "frac_reward_zero_std": 0.5, "grad_norm": 0.5038626194000244, "kl": 0.0, "learning_rate": 4.301242236024845e-07, "loss": 0.0028, "num_tokens": 15915797.0, "reward": 1.066666603088379, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941801071167, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 456 }, { "completion_length": 2144.416748046875, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4475.0, "completions/mean_length": 3242.58349609375, "completions/mean_terminated_length": 2573.300048828125, "completions/min_length": 863.0, "completions/min_terminated_length": 863.0, "epoch": 0.1550203527815468, "frac_reward_zero_std": 0.0, "grad_norm": 1.2132185697555542, "kl": NaN, "learning_rate": 4.2995169082125604e-07, "loss": -0.0736, "num_tokens": 15955018.0, "reward": 0.9791667461395264, "reward_std": 0.3328944146633148, "rewards/correctness_reward_func/mean": 0.7166666984558105, "rewards/correctness_reward_func/std": 0.3459725081920624, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 457 }, { "completion_length": 2428.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 3740.0, "completions/max_terminated_length": 3740.0, "completions/mean_length": 2428.75, "completions/mean_terminated_length": 2428.75, "completions/min_length": 1041.0, "completions/min_terminated_length": 1041.0, "epoch": 0.155359565807327, "frac_reward_zero_std": 0.5, "grad_norm": 0.09832356125116348, "kl": 0.0, "learning_rate": 4.297791580400276e-07, "loss": 0.0012, "num_tokens": 15992965.0, "reward": 1.1875, "reward_std": 0.03061862848699093, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 458 }, { "completion_length": 1184.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2296.0, "completions/max_terminated_length": 2296.0, "completions/mean_length": 1184.166748046875, "completions/mean_terminated_length": 1184.166748046875, "completions/min_length": 524.0, "completions/min_terminated_length": 524.0, "epoch": 0.1556987788331072, "frac_reward_zero_std": 0.5, "grad_norm": 0.07985512912273407, "kl": 0.0, "learning_rate": 4.2960662525879914e-07, "loss": 0.0001, "num_tokens": 16022223.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 459 }, { "completion_length": 2334.7501220703125, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5195.0, "completions/mean_length": 3432.916748046875, "completions/mean_terminated_length": 2801.699951171875, "completions/min_length": 1164.0, "completions/min_terminated_length": 1164.0, "epoch": 0.1560379918588874, "frac_reward_zero_std": 0.5, "grad_norm": 0.36306536197662354, "kl": NaN, "learning_rate": 4.2943409247757075e-07, "loss": -0.0326, "num_tokens": 16063908.0, "reward": 0.9166667461395264, "reward_std": 0.2840188145637512, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 460 }, { "completion_length": 3764.7501220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6420.0, "completions/mean_length": 4313.83349609375, "completions/mean_terminated_length": 4107.0, "completions/min_length": 2831.0, "completions/min_terminated_length": 2831.0, "epoch": 0.15637720488466758, "frac_reward_zero_std": 0.5, "grad_norm": 0.2848891615867615, "kl": NaN, "learning_rate": 4.292615596963423e-07, "loss": -0.0579, "num_tokens": 16124367.0, "reward": 0.6083333492279053, "reward_std": 0.22453658282756805, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 461 }, { "completion_length": 1265.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3036.0, "completions/max_terminated_length": 3036.0, "completions/mean_length": 1265.8333740234375, "completions/mean_terminated_length": 1265.8333740234375, "completions/min_length": 575.0, "completions/min_terminated_length": 575.0, "epoch": 0.15671641791044777, "frac_reward_zero_std": 0.5, "grad_norm": 0.3097565770149231, "kl": 0.0, "learning_rate": 4.2908902691511386e-07, "loss": 0.0004, "num_tokens": 16151689.0, "reward": 0.5541666746139526, "reward_std": 0.19900795817375183, "rewards/correctness_reward_func/mean": 0.2666666805744171, "rewards/correctness_reward_func/std": 0.393892765045166, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 462 }, { "completion_length": 2514.33349609375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6211.0, "completions/mean_length": 3612.5, "completions/mean_terminated_length": 3017.199951171875, "completions/min_length": 594.0, "completions/min_terminated_length": 594.0, "epoch": 0.15705563093622796, "frac_reward_zero_std": 0.0, "grad_norm": 0.7741003632545471, "kl": NaN, "learning_rate": 4.289164941338854e-07, "loss": -0.045, "num_tokens": 16194395.0, "reward": 0.5, "reward_std": 0.35132092237472534, "rewards/correctness_reward_func/mean": 0.25, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 463 }, { "completion_length": 1660.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2979.0, "completions/max_terminated_length": 2979.0, "completions/mean_length": 1660.916748046875, "completions/mean_terminated_length": 1660.916748046875, "completions/min_length": 679.0, "completions/min_terminated_length": 679.0, "epoch": 0.15739484396200815, "frac_reward_zero_std": 0.5, "grad_norm": 0.09117577224969864, "kl": 0.0, "learning_rate": 4.28743961352657e-07, "loss": -0.0022, "num_tokens": 16227238.0, "reward": 1.2708332538604736, "reward_std": 0.045871179550886154, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 464 }, { "completion_length": 2607.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6129.0, "completions/max_terminated_length": 6129.0, "completions/mean_length": 2607.5, "completions/mean_terminated_length": 2607.5, "completions/min_length": 1522.0, "completions/min_terminated_length": 1522.0, "epoch": 0.15773405698778833, "frac_reward_zero_std": 0.5, "grad_norm": 0.12152191996574402, "kl": 0.0, "learning_rate": 4.285714285714285e-07, "loss": -0.0006, "num_tokens": 16271164.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 465 }, { "completion_length": 2674.166748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5492.0, "completions/mean_length": 3223.25, "completions/mean_terminated_length": 2917.272705078125, "completions/min_length": 643.0, "completions/min_terminated_length": 643.0, "epoch": 0.15807327001356852, "frac_reward_zero_std": 0.0, "grad_norm": 0.6836945414543152, "kl": NaN, "learning_rate": 4.283988957902001e-07, "loss": -0.0139, "num_tokens": 16311852.0, "reward": 0.8583333492279053, "reward_std": 0.28804606199264526, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 466 }, { "completion_length": 616.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1650.0, "completions/max_terminated_length": 1650.0, "completions/mean_length": 616.0, "completions/mean_terminated_length": 616.0, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.1584124830393487, "frac_reward_zero_std": 0.5, "grad_norm": 0.06278982013463974, "kl": 0.0, "learning_rate": 4.2822636300897167e-07, "loss": 0.0002, "num_tokens": 16331100.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 467 }, { "completion_length": 1923.1666870117188, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6093.0, "completions/mean_length": 4668.58349609375, "completions/mean_terminated_length": 3296.857177734375, "completions/min_length": 932.0, "completions/min_terminated_length": 932.0, "epoch": 0.1587516960651289, "frac_reward_zero_std": 0.0, "grad_norm": 0.7384070158004761, "kl": NaN, "learning_rate": 4.280538302277433e-07, "loss": 0.0206, "num_tokens": 16367228.0, "reward": 0.37916669249534607, "reward_std": 0.3163924813270569, "rewards/correctness_reward_func/mean": 0.21666665375232697, "rewards/correctness_reward_func/std": 0.39504510164260864, "rewards/format_reward_func/mean": 0.16250000894069672, "rewards/format_reward_func/std": 0.14943073689937592, "step": 468 }, { "completion_length": 2877.7501220703125, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5040.0, "completions/mean_length": 3975.916748046875, "completions/mean_terminated_length": 3453.300048828125, "completions/min_length": 1252.0, "completions/min_terminated_length": 1252.0, "epoch": 0.1590909090909091, "frac_reward_zero_std": 0.0, "grad_norm": 0.8427397012710571, "kl": NaN, "learning_rate": 4.278812974465148e-07, "loss": -0.0179, "num_tokens": 16411697.0, "reward": 0.4624999761581421, "reward_std": 0.35593757033348083, "rewards/correctness_reward_func/mean": 0.25, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.21250002086162567, "rewards/format_reward_func/std": 0.11894422769546509, "step": 469 }, { "completion_length": 971.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1860.0, "completions/max_terminated_length": 1860.0, "completions/mean_length": 971.0, "completions/mean_terminated_length": 971.0, "completions/min_length": 584.0, "completions/min_terminated_length": 584.0, "epoch": 0.15943012211668928, "frac_reward_zero_std": 1.0, "grad_norm": 1.582651520948275e-07, "kl": 0.0, "learning_rate": 4.277087646652864e-07, "loss": 0.0, "num_tokens": 16437101.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 470 }, { "completion_length": 877.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1520.0, "completions/max_terminated_length": 1520.0, "completions/mean_length": 877.5833740234375, "completions/mean_terminated_length": 877.5833740234375, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.15976933514246947, "frac_reward_zero_std": 0.0, "grad_norm": 0.317609041929245, "kl": 0.0, "learning_rate": 4.2753623188405794e-07, "loss": -0.0005, "num_tokens": 16457586.0, "reward": 0.6791666746139526, "reward_std": 0.22598153352737427, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 471 }, { "completion_length": 1876.166748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3251.0, "completions/mean_length": 2425.25, "completions/mean_terminated_length": 2046.727294921875, "completions/min_length": 1071.0, "completions/min_terminated_length": 1071.0, "epoch": 0.16010854816824965, "frac_reward_zero_std": 0.5, "grad_norm": 0.21075165271759033, "kl": NaN, "learning_rate": 4.2736369910282954e-07, "loss": -0.0226, "num_tokens": 16495232.0, "reward": 1.0250000953674316, "reward_std": 0.23611438274383545, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.24308621883392334, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 472 }, { "completion_length": 2999.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5298.0, "completions/max_terminated_length": 5298.0, "completions/mean_length": 2999.33349609375, "completions/mean_terminated_length": 2999.33349609375, "completions/min_length": 931.0, "completions/min_terminated_length": 931.0, "epoch": 0.16044776119402984, "frac_reward_zero_std": 0.5, "grad_norm": 0.6389744877815247, "kl": 0.0, "learning_rate": 4.271911663216011e-07, "loss": -0.0009, "num_tokens": 16545672.0, "reward": 0.6333333253860474, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 473 }, { "completion_length": 1288.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3679.0, "completions/max_terminated_length": 3679.0, "completions/mean_length": 1288.75, "completions/mean_terminated_length": 1288.75, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.16078697421981003, "frac_reward_zero_std": 0.0, "grad_norm": 0.43943166732788086, "kl": 0.0, "learning_rate": 4.2701863354037265e-07, "loss": 0.0136, "num_tokens": 16575927.0, "reward": 0.6125000715255737, "reward_std": 0.21714738011360168, "rewards/correctness_reward_func/mean": 0.3500000238418579, "rewards/correctness_reward_func/std": 0.4358898997306824, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 474 }, { "completion_length": 1136.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2292.0, "completions/max_terminated_length": 2292.0, "completions/mean_length": 1136.25, "completions/mean_terminated_length": 1136.25, "completions/min_length": 605.0, "completions/min_terminated_length": 605.0, "epoch": 0.16112618724559022, "frac_reward_zero_std": 0.5, "grad_norm": 0.04957255721092224, "kl": 0.0, "learning_rate": 4.2684610075914425e-07, "loss": 0.0003, "num_tokens": 16598586.0, "reward": 0.6875001788139343, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 475 }, { "completion_length": 3438.916748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5689.0, "completions/mean_length": 5086.1669921875, "completions/mean_terminated_length": 4585.22216796875, "completions/min_length": 3284.0, "completions/min_terminated_length": 3284.0, "epoch": 0.1614654002713704, "frac_reward_zero_std": 0.5, "grad_norm": 0.16727101802825928, "kl": NaN, "learning_rate": 4.2667356797791575e-07, "loss": -0.0145, "num_tokens": 16649081.0, "reward": 0.1875, "reward_std": 0.041079193353652954, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.12990382313728333, "step": 476 }, { "completion_length": 2295.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5441.0, "completions/max_terminated_length": 5441.0, "completions/mean_length": 2295.25, "completions/mean_terminated_length": 2295.25, "completions/min_length": 550.0, "completions/min_terminated_length": 550.0, "epoch": 0.16180461329715062, "frac_reward_zero_std": 0.5, "grad_norm": 0.6933732032775879, "kl": 0.0, "learning_rate": 4.2650103519668736e-07, "loss": 0.019, "num_tokens": 16686392.0, "reward": 1.0833334922790527, "reward_std": 0.19407901167869568, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 477 }, { "completion_length": 2071.3333740234375, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5041.0, "completions/mean_length": 3718.58349609375, "completions/mean_terminated_length": 2761.77783203125, "completions/min_length": 963.0, "completions/min_terminated_length": 963.0, "epoch": 0.1621438263229308, "frac_reward_zero_std": 0.0, "grad_norm": 0.9527459740638733, "kl": NaN, "learning_rate": 4.263285024154589e-07, "loss": -0.0759, "num_tokens": 16724532.0, "reward": 0.7916667461395264, "reward_std": 0.31841057538986206, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.42497774958610535, "rewards/format_reward_func/mean": 0.2250000238418579, "rewards/format_reward_func/std": 0.11965861171483994, "step": 478 }, { "completion_length": 2586.0001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6358.0, "completions/mean_length": 3135.08349609375, "completions/mean_terminated_length": 2821.091064453125, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "epoch": 0.162483039348711, "frac_reward_zero_std": 0.5, "grad_norm": 0.6366803646087646, "kl": NaN, "learning_rate": 4.261559696342305e-07, "loss": -0.0671, "num_tokens": 16771362.0, "reward": 1.191666603088379, "reward_std": 0.26536136865615845, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 479 }, { "completion_length": 712.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1226.0, "completions/max_terminated_length": 1226.0, "completions/mean_length": 712.5, "completions/mean_terminated_length": 712.5, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 0.1628222523744912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.25983436853002e-07, "loss": 0.0, "num_tokens": 16790934.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 480 }, { "completion_length": 1154.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2626.0, "completions/max_terminated_length": 2626.0, "completions/mean_length": 1154.666748046875, "completions/mean_terminated_length": 1154.666748046875, "completions/min_length": 672.0, "completions/min_terminated_length": 672.0, "epoch": 0.16316146540027138, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.258109040717736e-07, "loss": 0.0, "num_tokens": 16819826.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 481 }, { "completion_length": 3743.750244140625, "completions/clipped_ratio": 0.0, "completions/max_length": 6112.0, "completions/max_terminated_length": 6112.0, "completions/mean_length": 3743.75, "completions/mean_terminated_length": 3743.75, "completions/min_length": 1972.0, "completions/min_terminated_length": 1972.0, "epoch": 0.16350067842605157, "frac_reward_zero_std": 1.0, "grad_norm": 4.1540701545272896e-07, "kl": 0.0, "learning_rate": 4.256383712905452e-07, "loss": 0.0, "num_tokens": 16876589.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 482 }, { "completion_length": 2762.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5686.0, "completions/max_terminated_length": 5686.0, "completions/mean_length": 2762.916748046875, "completions/mean_terminated_length": 2762.916748046875, "completions/min_length": 1178.0, "completions/min_terminated_length": 1178.0, "epoch": 0.16383989145183175, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.254658385093168e-07, "loss": 0.0, "num_tokens": 16923526.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 483 }, { "completion_length": 2833.1666870117188, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6037.0, "completions/mean_length": 5029.5, "completions/mean_terminated_length": 4249.75, "completions/min_length": 2548.0, "completions/min_terminated_length": 2548.0, "epoch": 0.16417910447761194, "frac_reward_zero_std": 0.5, "grad_norm": 0.6901265978813171, "kl": NaN, "learning_rate": 4.252933057280883e-07, "loss": -0.0956, "num_tokens": 16970076.0, "reward": 0.7333334684371948, "reward_std": 0.2840188145637512, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 484 }, { "completion_length": 1643.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2812.0, "completions/max_terminated_length": 2812.0, "completions/mean_length": 1643.666748046875, "completions/mean_terminated_length": 1643.666748046875, "completions/min_length": 969.0, "completions/min_terminated_length": 969.0, "epoch": 0.16451831750339213, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.251207729468599e-07, "loss": 0.0, "num_tokens": 17005076.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 485 }, { "completion_length": 1820.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3135.0, "completions/max_terminated_length": 3135.0, "completions/mean_length": 1820.8333740234375, "completions/mean_terminated_length": 1820.8333740234375, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "epoch": 0.16485753052917232, "frac_reward_zero_std": 0.0, "grad_norm": 0.6359934210777283, "kl": 0.0, "learning_rate": 4.2494824016563144e-07, "loss": 0.0045, "num_tokens": 17038380.0, "reward": 1.004166841506958, "reward_std": 0.28193777799606323, "rewards/correctness_reward_func/mean": 0.7166666984558105, "rewards/correctness_reward_func/std": 0.3459725081920624, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 486 }, { "completion_length": 1532.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3784.0, "completions/max_terminated_length": 3784.0, "completions/mean_length": 1532.416748046875, "completions/mean_terminated_length": 1532.416748046875, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 0.1651967435549525, "frac_reward_zero_std": 0.5, "grad_norm": 0.5672784447669983, "kl": 0.0, "learning_rate": 4.24775707384403e-07, "loss": 0.0126, "num_tokens": 17068793.0, "reward": 0.6499999761581421, "reward_std": 0.17606817185878754, "rewards/correctness_reward_func/mean": 0.3500000238418579, "rewards/correctness_reward_func/std": 0.4358898997306824, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 487 }, { "completion_length": 1676.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2966.0, "completions/max_terminated_length": 2966.0, "completions/mean_length": 1676.166748046875, "completions/mean_terminated_length": 1676.166748046875, "completions/min_length": 652.0, "completions/min_terminated_length": 652.0, "epoch": 0.1655359565807327, "frac_reward_zero_std": 0.0, "grad_norm": 0.6939499378204346, "kl": 0.0, "learning_rate": 4.246031746031746e-07, "loss": -0.0032, "num_tokens": 17098855.0, "reward": 0.5708333253860474, "reward_std": 0.43851161003112793, "rewards/correctness_reward_func/mean": 0.28333333134651184, "rewards/correctness_reward_func/std": 0.4217568039894104, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 488 }, { "completion_length": 608.5, "completions/clipped_ratio": 0.0, "completions/max_length": 911.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 608.5, "completions/mean_terminated_length": 608.5, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.16587516960651288, "frac_reward_zero_std": 1.0, "grad_norm": 9.13784745648627e-08, "kl": 0.0, "learning_rate": 4.2443064182194615e-07, "loss": 0.0, "num_tokens": 17118703.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 489 }, { "completion_length": 1851.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4297.0, "completions/max_terminated_length": 4297.0, "completions/mean_length": 1851.8333740234375, "completions/mean_terminated_length": 1851.8333740234375, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.16621438263229307, "frac_reward_zero_std": 0.0, "grad_norm": 0.16395634412765503, "kl": 0.0, "learning_rate": 4.2425810904071776e-07, "loss": -0.0026, "num_tokens": 17150327.0, "reward": 1.1666667461395264, "reward_std": 0.09559705853462219, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 490 }, { "completion_length": 1038.9167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2055.0, "completions/max_terminated_length": 2055.0, "completions/mean_length": 1038.916748046875, "completions/mean_terminated_length": 1038.916748046875, "completions/min_length": 536.0, "completions/min_terminated_length": 536.0, "epoch": 0.16655359565807326, "frac_reward_zero_std": 0.0, "grad_norm": 0.10568089783191681, "kl": 0.0, "learning_rate": 4.2408557625948926e-07, "loss": -0.0016, "num_tokens": 17176492.0, "reward": 0.7375000715255737, "reward_std": 0.0853908583521843, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 491 }, { "completion_length": 2283.416748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6078.0, "completions/mean_length": 2832.5, "completions/mean_terminated_length": 2491.0, "completions/min_length": 654.0, "completions/min_terminated_length": 654.0, "epoch": 0.16689280868385345, "frac_reward_zero_std": 0.5, "grad_norm": 0.4687753915786743, "kl": NaN, "learning_rate": 4.2391304347826086e-07, "loss": -0.0141, "num_tokens": 17219097.0, "reward": 0.8083333969116211, "reward_std": 0.23327383399009705, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 492 }, { "completion_length": 2379.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5512.0, "completions/max_terminated_length": 5512.0, "completions/mean_length": 2379.666748046875, "completions/mean_terminated_length": 2379.666748046875, "completions/min_length": 583.0, "completions/min_terminated_length": 583.0, "epoch": 0.16723202170963364, "frac_reward_zero_std": 0.0, "grad_norm": 0.5325803756713867, "kl": 0.0, "learning_rate": 4.237405106970324e-07, "loss": 0.0065, "num_tokens": 17260247.0, "reward": 0.9500000476837158, "reward_std": 0.27386125922203064, "rewards/correctness_reward_func/mean": 0.6499999761581421, "rewards/correctness_reward_func/std": 0.40113475918769836, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 493 }, { "completion_length": 2168.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3947.0, "completions/max_terminated_length": 3947.0, "completions/mean_length": 2168.666748046875, "completions/mean_terminated_length": 2168.666748046875, "completions/min_length": 1066.0, "completions/min_terminated_length": 1066.0, "epoch": 0.16757123473541383, "frac_reward_zero_std": 1.0, "grad_norm": 1.7554351927628886e-07, "kl": 0.0, "learning_rate": 4.23567977915804e-07, "loss": 0.0, "num_tokens": 17298409.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 494 }, { "completion_length": 1945.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4646.0, "completions/max_terminated_length": 4646.0, "completions/mean_length": 1945.0, "completions/mean_terminated_length": 1945.0, "completions/min_length": 657.0, "completions/min_terminated_length": 657.0, "epoch": 0.16791044776119404, "frac_reward_zero_std": 1.0, "grad_norm": 2.706338761981897e-07, "kl": 0.0, "learning_rate": 4.233954451345755e-07, "loss": 0.0, "num_tokens": 17336221.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 495 }, { "completion_length": 1856.25, "completions/clipped_ratio": 0.0, "completions/max_length": 4646.0, "completions/max_terminated_length": 4646.0, "completions/mean_length": 1856.25, "completions/mean_terminated_length": 1856.25, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "epoch": 0.16824966078697423, "frac_reward_zero_std": 0.0, "grad_norm": 0.0943993553519249, "kl": 0.0, "learning_rate": 4.2322291235334713e-07, "loss": -0.0045, "num_tokens": 17374858.0, "reward": 1.2666666507720947, "reward_std": 0.08164961636066437, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 496 }, { "completion_length": 1558.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3671.0, "completions/max_terminated_length": 3671.0, "completions/mean_length": 1558.75, "completions/mean_terminated_length": 1558.75, "completions/min_length": 586.0, "completions/min_terminated_length": 586.0, "epoch": 0.16858887381275442, "frac_reward_zero_std": 1.0, "grad_norm": 1.6221547127770464e-07, "kl": 0.0, "learning_rate": 4.230503795721187e-07, "loss": 0.0, "num_tokens": 17402695.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 497 }, { "completion_length": 3426.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6069.0, "completions/max_terminated_length": 6069.0, "completions/mean_length": 3426.666748046875, "completions/mean_terminated_length": 3426.666748046875, "completions/min_length": 1866.0, "completions/min_terminated_length": 1866.0, "epoch": 0.1689280868385346, "frac_reward_zero_std": 0.0, "grad_norm": 0.6243091821670532, "kl": 0.0, "learning_rate": 4.2287784679089023e-07, "loss": 0.0319, "num_tokens": 17453313.0, "reward": 0.4541667103767395, "reward_std": 0.28881752490997314, "rewards/correctness_reward_func/mean": 0.1666666716337204, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 498 }, { "completion_length": 1164.6667175292969, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 3076.0, "completions/mean_length": 4459.1669921875, "completions/mean_terminated_length": 2329.33349609375, "completions/min_length": 1407.0, "completions/min_terminated_length": 1407.0, "epoch": 0.1692672998643148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": NaN, "learning_rate": 4.227053140096618e-07, "loss": 0.0, "num_tokens": 17482115.0, "reward": 0.15000000596046448, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 499 }, { "completion_length": 1079.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2631.0, "completions/max_terminated_length": 2631.0, "completions/mean_length": 1079.5, "completions/mean_terminated_length": 1079.5, "completions/min_length": 621.0, "completions/min_terminated_length": 621.0, "epoch": 0.16960651289009498, "frac_reward_zero_std": 1.0, "grad_norm": 9.258025102099054e-08, "kl": 0.0, "learning_rate": 4.225327812284334e-07, "loss": 0.0, "num_tokens": 17506619.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 500 }, { "completion_length": 1423.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2845.0, "completions/max_terminated_length": 2845.0, "completions/mean_length": 1423.0833740234375, "completions/mean_terminated_length": 1423.0833740234375, "completions/min_length": 978.0, "completions/min_terminated_length": 978.0, "epoch": 0.16994572591587517, "frac_reward_zero_std": 0.5, "grad_norm": 0.1390148103237152, "kl": 0.0, "learning_rate": 4.2236024844720495e-07, "loss": 0.0015, "num_tokens": 17536302.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 501 }, { "completion_length": 1925.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3312.0, "completions/max_terminated_length": 3312.0, "completions/mean_length": 1925.916748046875, "completions/mean_terminated_length": 1925.916748046875, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "epoch": 0.17028493894165536, "frac_reward_zero_std": 1.0, "grad_norm": 3.353748923018429e-07, "kl": 0.0, "learning_rate": 4.221877156659765e-07, "loss": 0.0, "num_tokens": 17568329.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 502 }, { "completion_length": 2333.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4467.0, "completions/max_terminated_length": 4467.0, "completions/mean_length": 2333.58349609375, "completions/mean_terminated_length": 2333.58349609375, "completions/min_length": 696.0, "completions/min_terminated_length": 696.0, "epoch": 0.17062415196743555, "frac_reward_zero_std": 0.5, "grad_norm": 0.10262121260166168, "kl": 0.0, "learning_rate": 4.220151828847481e-07, "loss": -0.0002, "num_tokens": 17607348.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 503 }, { "completion_length": 2920.8334350585938, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6092.0, "completions/mean_length": 3469.916748046875, "completions/mean_terminated_length": 3186.36376953125, "completions/min_length": 556.0, "completions/min_terminated_length": 556.0, "epoch": 0.17096336499321574, "frac_reward_zero_std": 0.0, "grad_norm": 0.6888859868049622, "kl": NaN, "learning_rate": 4.2184265010351966e-07, "loss": -0.0503, "num_tokens": 17652208.0, "reward": 1.0833333730697632, "reward_std": 0.22770795226097107, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 504 }, { "completion_length": 2582.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4693.0, "completions/max_terminated_length": 4693.0, "completions/mean_length": 2582.166748046875, "completions/mean_terminated_length": 2582.166748046875, "completions/min_length": 887.0, "completions/min_terminated_length": 887.0, "epoch": 0.17130257801899593, "frac_reward_zero_std": 0.5, "grad_norm": 0.13936969637870789, "kl": 0.0, "learning_rate": 4.2167011732229126e-07, "loss": -0.0029, "num_tokens": 17696994.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 505 }, { "completion_length": 1910.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4204.0, "completions/max_terminated_length": 4204.0, "completions/mean_length": 1910.166748046875, "completions/mean_terminated_length": 1910.166748046875, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 0.17164179104477612, "frac_reward_zero_std": 0.5, "grad_norm": 0.7006736397743225, "kl": 0.0, "learning_rate": 4.2149758454106276e-07, "loss": 0.0117, "num_tokens": 17730788.0, "reward": 0.9500000476837158, "reward_std": 0.27386125922203064, "rewards/correctness_reward_func/mean": 0.6499999761581421, "rewards/correctness_reward_func/std": 0.40113475918769836, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 506 }, { "completion_length": 1539.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3405.0, "completions/max_terminated_length": 3405.0, "completions/mean_length": 1539.416748046875, "completions/mean_terminated_length": 1539.416748046875, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.1719810040705563, "frac_reward_zero_std": 0.5, "grad_norm": 0.11347401887178421, "kl": 0.0, "learning_rate": 4.2132505175983437e-07, "loss": -0.0027, "num_tokens": 17758231.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 507 }, { "completion_length": 1861.0834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 4365.0, "completions/max_terminated_length": 4365.0, "completions/mean_length": 1861.0833740234375, "completions/mean_terminated_length": 1861.0833740234375, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 0.1723202170963365, "frac_reward_zero_std": 0.5, "grad_norm": 0.16115796566009521, "kl": 0.0, "learning_rate": 4.211525189786059e-07, "loss": 0.0001, "num_tokens": 17789888.0, "reward": 1.1500000953674316, "reward_std": 0.05477222427725792, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.09045340120792389, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 508 }, { "completion_length": 1629.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3168.0, "completions/max_terminated_length": 3168.0, "completions/mean_length": 1629.8333740234375, "completions/mean_terminated_length": 1629.8333740234375, "completions/min_length": 874.0, "completions/min_terminated_length": 874.0, "epoch": 0.17265943012211668, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.209799861973775e-07, "loss": 0.0, "num_tokens": 17820948.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 509 }, { "completion_length": 1754.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 3233.0, "completions/max_terminated_length": 3233.0, "completions/mean_length": 1754.25, "completions/mean_terminated_length": 1754.25, "completions/min_length": 771.0, "completions/min_terminated_length": 771.0, "epoch": 0.17299864314789687, "frac_reward_zero_std": 0.5, "grad_norm": 0.33984920382499695, "kl": 0.0, "learning_rate": 4.2080745341614903e-07, "loss": 0.01, "num_tokens": 17851845.0, "reward": 0.36666667461395264, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.06666667014360428, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 510 }, { "completion_length": 1409.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4162.0, "completions/max_terminated_length": 4162.0, "completions/mean_length": 1409.3333740234375, "completions/mean_terminated_length": 1409.3333740234375, "completions/min_length": 551.0, "completions/min_terminated_length": 551.0, "epoch": 0.17333785617367706, "frac_reward_zero_std": 0.5, "grad_norm": 0.3745829463005066, "kl": 0.0, "learning_rate": 4.2063492063492063e-07, "loss": 0.0158, "num_tokens": 17880055.0, "reward": 0.833333432674408, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 511 }, { "completion_length": 1085.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1805.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 1085.916748046875, "completions/mean_terminated_length": 1085.916748046875, "completions/min_length": 630.0, "completions/min_terminated_length": 630.0, "epoch": 0.17367706919945725, "frac_reward_zero_std": 0.5, "grad_norm": 0.05689301714301109, "kl": 0.0, "learning_rate": 4.204623878536922e-07, "loss": -0.0001, "num_tokens": 17904756.0, "reward": 1.0875000953674316, "reward_std": 0.030618613585829735, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 512 }, { "completion_length": 2082.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3168.0, "completions/max_terminated_length": 3168.0, "completions/mean_length": 2082.83349609375, "completions/mean_terminated_length": 2082.83349609375, "completions/min_length": 1306.0, "completions/min_terminated_length": 1306.0, "epoch": 0.17401628222523746, "frac_reward_zero_std": 0.5, "grad_norm": 0.502258837223053, "kl": 0.0, "learning_rate": 4.2028985507246374e-07, "loss": 0.0137, "num_tokens": 17941312.0, "reward": 0.5666667222976685, "reward_std": 0.20655910670757294, "rewards/correctness_reward_func/mean": 0.2666666805744171, "rewards/correctness_reward_func/std": 0.393892765045166, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 513 }, { "completion_length": 1895.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3063.0, "completions/max_terminated_length": 3063.0, "completions/mean_length": 1895.916748046875, "completions/mean_terminated_length": 1895.916748046875, "completions/min_length": 1216.0, "completions/min_terminated_length": 1216.0, "epoch": 0.17435549525101765, "frac_reward_zero_std": 1.0, "grad_norm": 1.2023741646771668e-07, "kl": 0.0, "learning_rate": 4.201173222912353e-07, "loss": 0.0, "num_tokens": 17976897.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 514 }, { "completion_length": 3192.2501220703125, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6415.0, "completions/mean_length": 4290.4169921875, "completions/mean_terminated_length": 3830.699951171875, "completions/min_length": 1845.0, "completions/min_terminated_length": 1845.0, "epoch": 0.17469470827679784, "frac_reward_zero_std": 0.0, "grad_norm": 0.8532640933990479, "kl": NaN, "learning_rate": 4.199447895100069e-07, "loss": -0.0693, "num_tokens": 18027606.0, "reward": 0.9666666984558105, "reward_std": 0.3814123868942261, "rewards/correctness_reward_func/mean": 0.7166666984558105, "rewards/correctness_reward_func/std": 0.4386618733406067, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.09770084172487259, "step": 515 }, { "completion_length": 2581.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6207.0, "completions/max_terminated_length": 6207.0, "completions/mean_length": 2581.5, "completions/mean_terminated_length": 2581.5, "completions/min_length": 869.0, "completions/min_terminated_length": 869.0, "epoch": 0.17503392130257803, "frac_reward_zero_std": 0.0, "grad_norm": 0.8625640869140625, "kl": 0.0, "learning_rate": 4.1977225672877845e-07, "loss": 0.0173, "num_tokens": 18071898.0, "reward": 0.8166667222976685, "reward_std": 0.46741676330566406, "rewards/correctness_reward_func/mean": 0.5166666507720947, "rewards/correctness_reward_func/std": 0.4628632962703705, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 516 }, { "completion_length": 1080.3333587646484, "completions/clipped_ratio": 0.0, "completions/max_length": 3990.0, "completions/max_terminated_length": 3990.0, "completions/mean_length": 1080.3333740234375, "completions/mean_terminated_length": 1080.3333740234375, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.17537313432835822, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.1959972394755e-07, "loss": 0.0, "num_tokens": 18096424.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 517 }, { "completion_length": 830.0833587646484, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 2506.0, "completions/mean_length": 4124.58349609375, "completions/mean_terminated_length": 1660.166748046875, "completions/min_length": 945.0, "completions/min_terminated_length": 945.0, "epoch": 0.1757123473541384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": NaN, "learning_rate": 4.1942719116632156e-07, "loss": 0.0, "num_tokens": 18123209.0, "reward": 0.15000000596046448, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 518 }, { "completion_length": 2415.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4186.0, "completions/max_terminated_length": 4186.0, "completions/mean_length": 2415.0, "completions/mean_terminated_length": 2415.0, "completions/min_length": 884.0, "completions/min_terminated_length": 884.0, "epoch": 0.1760515603799186, "frac_reward_zero_std": 1.0, "grad_norm": 1.772355489038091e-07, "kl": 0.0, "learning_rate": 4.1925465838509316e-07, "loss": 0.0, "num_tokens": 18164189.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 519 }, { "completion_length": 862.75, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 2617.0, "completions/mean_length": 3059.08349609375, "completions/mean_terminated_length": 1294.125, "completions/min_length": 738.0, "completions/min_terminated_length": 738.0, "epoch": 0.17639077340569878, "frac_reward_zero_std": 0.5, "grad_norm": 1.4193612337112427, "kl": NaN, "learning_rate": 4.190821256038647e-07, "loss": -0.0477, "num_tokens": 18186080.0, "reward": 0.7666667699813843, "reward_std": 0.3356585204601288, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.42497774958610535, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 520 }, { "completion_length": 2900.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4391.0, "completions/max_terminated_length": 4391.0, "completions/mean_length": 2900.5, "completions/mean_terminated_length": 2900.5, "completions/min_length": 938.0, "completions/min_terminated_length": 938.0, "epoch": 0.17672998643147897, "frac_reward_zero_std": 1.0, "grad_norm": 4.6848953161315876e-07, "kl": 0.0, "learning_rate": 4.1890959282263627e-07, "loss": 0.0, "num_tokens": 18233546.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 521 }, { "completion_length": 3361.8333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5968.0, "completions/mean_length": 3910.916748046875, "completions/mean_terminated_length": 3667.45458984375, "completions/min_length": 949.0, "completions/min_terminated_length": 949.0, "epoch": 0.17706919945725916, "frac_reward_zero_std": 0.5, "grad_norm": 0.06511241942644119, "kl": NaN, "learning_rate": 4.1873706004140787e-07, "loss": -0.0128, "num_tokens": 18285468.0, "reward": 0.6750000715255737, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 522 }, { "completion_length": 1127.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3259.0, "completions/max_terminated_length": 3259.0, "completions/mean_length": 1127.75, "completions/mean_terminated_length": 1127.75, "completions/min_length": 552.0, "completions/min_terminated_length": 552.0, "epoch": 0.17740841248303935, "frac_reward_zero_std": 0.5, "grad_norm": 0.09431690722703934, "kl": 0.0, "learning_rate": 4.185645272601794e-07, "loss": 0.0045, "num_tokens": 18310161.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 523 }, { "completion_length": 2173.0834350585938, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5945.0, "completions/mean_length": 2722.166748046875, "completions/mean_terminated_length": 2370.636474609375, "completions/min_length": 905.0, "completions/min_terminated_length": 905.0, "epoch": 0.17774762550881953, "frac_reward_zero_std": 0.0, "grad_norm": 0.13693396747112274, "kl": NaN, "learning_rate": 4.18391994478951e-07, "loss": -0.0113, "num_tokens": 18345670.0, "reward": 0.7250000834465027, "reward_std": 0.11600948870182037, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 524 }, { "completion_length": 1425.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3034.0, "completions/max_terminated_length": 3034.0, "completions/mean_length": 1425.166748046875, "completions/mean_terminated_length": 1425.166748046875, "completions/min_length": 568.0, "completions/min_terminated_length": 568.0, "epoch": 0.17808683853459972, "frac_reward_zero_std": 0.0, "grad_norm": 0.12271206825971603, "kl": 0.0, "learning_rate": 4.1821946169772253e-07, "loss": 0.0042, "num_tokens": 18373776.0, "reward": 1.2333333492279053, "reward_std": 0.0955970510840416, "rewards/correctness_reward_func/mean": 0.9333333969116211, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 525 }, { "completion_length": 3609.0001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6538.0, "completions/mean_length": 4158.08349609375, "completions/mean_terminated_length": 3937.091064453125, "completions/min_length": 863.0, "completions/min_terminated_length": 863.0, "epoch": 0.1784260515603799, "frac_reward_zero_std": 0.5, "grad_norm": 1.16100013256073, "kl": NaN, "learning_rate": 4.1804692891649414e-07, "loss": -0.0366, "num_tokens": 18428916.0, "reward": 0.3583333194255829, "reward_std": 0.22453653812408447, "rewards/correctness_reward_func/mean": 0.0833333358168602, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 526 }, { "completion_length": 2267.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4891.0, "completions/max_terminated_length": 4891.0, "completions/mean_length": 2267.666748046875, "completions/mean_terminated_length": 2267.666748046875, "completions/min_length": 1136.0, "completions/min_terminated_length": 1136.0, "epoch": 0.1787652645861601, "frac_reward_zero_std": 0.5, "grad_norm": 0.5539312958717346, "kl": 0.0, "learning_rate": 4.178743961352657e-07, "loss": 0.0149, "num_tokens": 18470840.0, "reward": 1.0500000715255737, "reward_std": 0.17606817185878754, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.24308621883392334, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 527 }, { "completion_length": 666.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1261.0, "completions/max_terminated_length": 1261.0, "completions/mean_length": 666.25, "completions/mean_terminated_length": 666.25, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.1791044776119403, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.1770186335403724e-07, "loss": 0.0, "num_tokens": 18494627.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 528 }, { "completion_length": 1161.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2421.0, "completions/max_terminated_length": 2421.0, "completions/mean_length": 1161.5, "completions/mean_terminated_length": 1161.5, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 0.17944369063772048, "frac_reward_zero_std": 1.0, "grad_norm": 1.6599263119587704e-07, "kl": 0.0, "learning_rate": 4.175293305728088e-07, "loss": 0.0, "num_tokens": 18518861.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 529 }, { "completion_length": 921.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1732.0, "completions/max_terminated_length": 1732.0, "completions/mean_length": 921.0, "completions/mean_terminated_length": 921.0, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.17978290366350066, "frac_reward_zero_std": 0.5, "grad_norm": 0.20871922373771667, "kl": 0.0, "learning_rate": 4.173567977915804e-07, "loss": -0.0013, "num_tokens": 18541703.0, "reward": 0.3541666865348816, "reward_std": 0.17205862700939178, "rewards/correctness_reward_func/mean": 0.06666667014360428, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 530 }, { "completion_length": 1214.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 4228.0, "completions/max_terminated_length": 4228.0, "completions/mean_length": 1214.75, "completions/mean_terminated_length": 1214.75, "completions/min_length": 581.0, "completions/min_terminated_length": 581.0, "epoch": 0.18012211668928088, "frac_reward_zero_std": 0.5, "grad_norm": 0.06504378467798233, "kl": 0.0, "learning_rate": 4.171842650103519e-07, "loss": -0.0003, "num_tokens": 18563744.0, "reward": 0.7749999761581421, "reward_std": 0.03872983902692795, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 531 }, { "completion_length": 1132.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1858.0, "completions/max_terminated_length": 1858.0, "completions/mean_length": 1132.5, "completions/mean_terminated_length": 1132.5, "completions/min_length": 598.0, "completions/min_terminated_length": 598.0, "epoch": 0.18046132971506107, "frac_reward_zero_std": 1.0, "grad_norm": 2.329216783891752e-07, "kl": 0.0, "learning_rate": 4.170117322291235e-07, "loss": 0.0, "num_tokens": 18589352.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 532 }, { "completion_length": 701.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 918.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 701.6666870117188, "completions/mean_terminated_length": 701.6666870117188, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "epoch": 0.18080054274084126, "frac_reward_zero_std": 0.5, "grad_norm": 0.12558390200138092, "kl": 0.0, "learning_rate": 4.1683919944789506e-07, "loss": -0.0, "num_tokens": 18610486.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 533 }, { "completion_length": 1081.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1984.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 1081.666748046875, "completions/mean_terminated_length": 1081.666748046875, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.18113975576662145, "frac_reward_zero_std": 0.5, "grad_norm": 0.09250768274068832, "kl": 0.0, "learning_rate": 4.1666666666666667e-07, "loss": -0.0001, "num_tokens": 18636492.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 534 }, { "completion_length": 2280.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4509.0, "completions/max_terminated_length": 4509.0, "completions/mean_length": 2280.83349609375, "completions/mean_terminated_length": 2280.83349609375, "completions/min_length": 819.0, "completions/min_terminated_length": 819.0, "epoch": 0.18147896879240163, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.164941338854382e-07, "loss": 0.0, "num_tokens": 18671170.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 535 }, { "completion_length": 1914.8333740234375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4516.0, "completions/mean_length": 3013.0, "completions/mean_terminated_length": 2297.800048828125, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "epoch": 0.18181818181818182, "frac_reward_zero_std": 0.0, "grad_norm": 0.438634991645813, "kl": NaN, "learning_rate": 4.1632160110420977e-07, "loss": -0.028, "num_tokens": 18707870.0, "reward": 1.0499999523162842, "reward_std": 0.3872982859611511, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.38138505816459656, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 536 }, { "completion_length": 2529.666748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5630.0, "completions/mean_length": 4176.9169921875, "completions/mean_terminated_length": 3372.888916015625, "completions/min_length": 2035.0, "completions/min_terminated_length": 2035.0, "epoch": 0.182157394843962, "frac_reward_zero_std": 0.5, "grad_norm": 0.38183891773223877, "kl": NaN, "learning_rate": 4.161490683229814e-07, "loss": -0.0232, "num_tokens": 18749434.0, "reward": 0.6375000476837158, "reward_std": 0.07373940199613571, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.11894422769546509, "step": 537 }, { "completion_length": 2225.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4071.0, "completions/max_terminated_length": 4071.0, "completions/mean_length": 2225.33349609375, "completions/mean_terminated_length": 2225.33349609375, "completions/min_length": 822.0, "completions/min_terminated_length": 822.0, "epoch": 0.1824966078697422, "frac_reward_zero_std": 1.0, "grad_norm": 1.5497386129936785e-07, "kl": 0.0, "learning_rate": 4.1597653554175293e-07, "loss": 0.0, "num_tokens": 18788006.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 538 }, { "completion_length": 1238.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3105.0, "completions/max_terminated_length": 3105.0, "completions/mean_length": 1238.0833740234375, "completions/mean_terminated_length": 1238.0833740234375, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "epoch": 0.1828358208955224, "frac_reward_zero_std": 1.0, "grad_norm": 2.6287909804523224e-07, "kl": 0.0, "learning_rate": 4.158040027605245e-07, "loss": 0.0, "num_tokens": 18813957.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 539 }, { "completion_length": 1780.666748046875, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6236.0, "completions/mean_length": 3977.0, "completions/mean_terminated_length": 2671.0, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 0.18317503392130258, "frac_reward_zero_std": 0.0, "grad_norm": 0.5980116128921509, "kl": NaN, "learning_rate": 4.1563146997929604e-07, "loss": -0.0649, "num_tokens": 18847355.0, "reward": 0.7333333492279053, "reward_std": 0.27224498987197876, "rewards/correctness_reward_func/mean": 0.5333333015441895, "rewards/correctness_reward_func/std": 0.47736650705337524, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 540 }, { "completion_length": 2010.916748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5844.0, "completions/mean_length": 3658.166748046875, "completions/mean_terminated_length": 2681.22216796875, "completions/min_length": 1292.0, "completions/min_terminated_length": 1292.0, "epoch": 0.18351424694708277, "frac_reward_zero_std": 0.0, "grad_norm": 4.284682750701904, "kl": NaN, "learning_rate": 4.1545893719806764e-07, "loss": -0.0252, "num_tokens": 18885052.0, "reward": 0.6750000715255737, "reward_std": 0.13693061470985413, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 541 }, { "completion_length": 2788.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5735.0, "completions/max_terminated_length": 5735.0, "completions/mean_length": 2788.08349609375, "completions/mean_terminated_length": 2788.08349609375, "completions/min_length": 794.0, "completions/min_terminated_length": 794.0, "epoch": 0.18385345997286295, "frac_reward_zero_std": 0.0, "grad_norm": 0.6677839756011963, "kl": 0.0, "learning_rate": 4.1528640441683914e-07, "loss": -0.0025, "num_tokens": 18932045.0, "reward": 1.066666841506958, "reward_std": 0.2168930023908615, "rewards/correctness_reward_func/mean": 0.7666667103767395, "rewards/correctness_reward_func/std": 0.25346091389656067, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 542 }, { "completion_length": 1617.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4218.0, "completions/max_terminated_length": 4218.0, "completions/mean_length": 1617.0, "completions/mean_terminated_length": 1617.0, "completions/min_length": 701.0, "completions/min_terminated_length": 701.0, "epoch": 0.18419267299864314, "frac_reward_zero_std": 1.0, "grad_norm": 9.618871388283878e-08, "kl": 0.0, "learning_rate": 4.1511387163561075e-07, "loss": 0.0, "num_tokens": 18965921.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 543 }, { "completion_length": 1418.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3041.0, "completions/max_terminated_length": 3041.0, "completions/mean_length": 1418.666748046875, "completions/mean_terminated_length": 1418.666748046875, "completions/min_length": 660.0, "completions/min_terminated_length": 660.0, "epoch": 0.18453188602442333, "frac_reward_zero_std": 0.5, "grad_norm": 0.2889324724674225, "kl": 0.0, "learning_rate": 4.149413388543823e-07, "loss": 0.0026, "num_tokens": 18996343.0, "reward": 0.833333432674408, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 544 }, { "completion_length": 1246.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2340.0, "completions/max_terminated_length": 2340.0, "completions/mean_length": 1246.3333740234375, "completions/mean_terminated_length": 1246.3333740234375, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "epoch": 0.18487109905020352, "frac_reward_zero_std": 0.5, "grad_norm": 0.43304693698883057, "kl": 0.0, "learning_rate": 4.147688060731539e-07, "loss": 0.0088, "num_tokens": 19020623.0, "reward": 1.1166667938232422, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 545 }, { "completion_length": 1579.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4176.0, "completions/max_terminated_length": 4176.0, "completions/mean_length": 1579.0, "completions/mean_terminated_length": 1579.0, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.1852103120759837, "frac_reward_zero_std": 1.0, "grad_norm": 2.227987891956218e-07, "kl": 0.0, "learning_rate": 4.145962732919254e-07, "loss": 0.0, "num_tokens": 19051523.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 546 }, { "completion_length": 2309.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6122.0, "completions/max_terminated_length": 6122.0, "completions/mean_length": 2309.0, "completions/mean_terminated_length": 2309.0, "completions/min_length": 819.0, "completions/min_terminated_length": 819.0, "epoch": 0.1855495251017639, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.14423740510697e-07, "loss": 0.0, "num_tokens": 19088027.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 547 }, { "completion_length": 1715.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3856.0, "completions/max_terminated_length": 3856.0, "completions/mean_length": 1715.0, "completions/mean_terminated_length": 1715.0, "completions/min_length": 691.0, "completions/min_terminated_length": 691.0, "epoch": 0.18588873812754408, "frac_reward_zero_std": 0.0, "grad_norm": 0.35062411427497864, "kl": 0.0, "learning_rate": 4.1425120772946856e-07, "loss": -0.0015, "num_tokens": 19123619.0, "reward": 0.8833333849906921, "reward_std": 0.26133137941360474, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 548 }, { "completion_length": 1286.8333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3200.0, "completions/mean_length": 1835.916748046875, "completions/mean_terminated_length": 1403.8182373046875, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "epoch": 0.1862279511533243, "frac_reward_zero_std": 0.0, "grad_norm": 0.7429186701774597, "kl": NaN, "learning_rate": 4.1407867494824017e-07, "loss": -0.0282, "num_tokens": 19155267.0, "reward": 0.8083333373069763, "reward_std": 0.5039968490600586, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.47736650705337524, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 549 }, { "completion_length": 2776.08349609375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6261.0, "completions/mean_length": 3325.166748046875, "completions/mean_terminated_length": 3028.45458984375, "completions/min_length": 966.0, "completions/min_terminated_length": 966.0, "epoch": 0.1865671641791045, "frac_reward_zero_std": 0.5, "grad_norm": 0.08030923455953598, "kl": NaN, "learning_rate": 4.1390614216701167e-07, "loss": -0.0084, "num_tokens": 19195720.0, "reward": 0.6875001788139343, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 550 }, { "completion_length": 2092.5833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5555.0, "completions/mean_length": 2641.666748046875, "completions/mean_terminated_length": 2282.818359375, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 0.18690637720488468, "frac_reward_zero_std": 0.5, "grad_norm": 0.49874621629714966, "kl": NaN, "learning_rate": 4.137336093857833e-07, "loss": -0.0328, "num_tokens": 19236557.0, "reward": 1.1000001430511475, "reward_std": 0.23664319515228271, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.38138505816459656, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 551 }, { "completion_length": 2317.5833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6479.0, "completions/mean_length": 2866.666748046875, "completions/mean_terminated_length": 2528.272705078125, "completions/min_length": 678.0, "completions/min_terminated_length": 678.0, "epoch": 0.18724559023066487, "frac_reward_zero_std": 0.5, "grad_norm": 0.5327326059341431, "kl": NaN, "learning_rate": 4.135610766045549e-07, "loss": -0.0418, "num_tokens": 19275342.0, "reward": 0.6916667819023132, "reward_std": 0.26536139845848083, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 552 }, { "completion_length": 1518.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3452.0, "completions/max_terminated_length": 3452.0, "completions/mean_length": 1518.416748046875, "completions/mean_terminated_length": 1518.416748046875, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 0.18758480325644505, "frac_reward_zero_std": 0.0, "grad_norm": 0.5784144401550293, "kl": 0.0, "learning_rate": 4.1338854382332643e-07, "loss": -0.0123, "num_tokens": 19305695.0, "reward": 0.7833334803581238, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 553 }, { "completion_length": 1451.5833740234375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5332.0, "completions/mean_length": 2549.75, "completions/mean_terminated_length": 1741.9000244140625, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.18792401628222524, "frac_reward_zero_std": 0.0, "grad_norm": 0.5653297901153564, "kl": NaN, "learning_rate": 4.13216011042098e-07, "loss": -0.0303, "num_tokens": 19336656.0, "reward": 0.9125000834465027, "reward_std": 0.31922924518585205, "rewards/correctness_reward_func/mean": 0.6499999761581421, "rewards/correctness_reward_func/std": 0.40113475918769836, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 554 }, { "completion_length": 3387.5833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5996.0, "completions/mean_length": 3936.666748046875, "completions/mean_terminated_length": 3695.545654296875, "completions/min_length": 2256.0, "completions/min_terminated_length": 2256.0, "epoch": 0.18826322930800543, "frac_reward_zero_std": 0.0, "grad_norm": 0.1428702175617218, "kl": NaN, "learning_rate": 4.1304347826086954e-07, "loss": -0.0036, "num_tokens": 19388881.0, "reward": 0.7541667819023132, "reward_std": 0.08225837349891663, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 555 }, { "completion_length": 723.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1403.0, "completions/max_terminated_length": 1403.0, "completions/mean_length": 723.4166870117188, "completions/mean_terminated_length": 723.4166870117188, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 0.18860244233378562, "frac_reward_zero_std": 1.0, "grad_norm": 1.9308046717014804e-07, "kl": 0.0, "learning_rate": 4.1287094547964115e-07, "loss": 0.0, "num_tokens": 19411362.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 556 }, { "completion_length": 995.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1785.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 995.0, "completions/mean_terminated_length": 995.0, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.1889416553595658, "frac_reward_zero_std": 1.0, "grad_norm": 3.172017670749483e-07, "kl": 0.0, "learning_rate": 4.1269841269841265e-07, "loss": 0.0, "num_tokens": 19436022.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 557 }, { "completion_length": 2856.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4397.0, "completions/max_terminated_length": 4397.0, "completions/mean_length": 2856.83349609375, "completions/mean_terminated_length": 2856.83349609375, "completions/min_length": 906.0, "completions/min_terminated_length": 906.0, "epoch": 0.189280868385346, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.1252587991718425e-07, "loss": 0.0, "num_tokens": 19481284.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 558 }, { "completion_length": 1833.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5297.0, "completions/max_terminated_length": 5297.0, "completions/mean_length": 1833.5, "completions/mean_terminated_length": 1833.5, "completions/min_length": 596.0, "completions/min_terminated_length": 596.0, "epoch": 0.18962008141112618, "frac_reward_zero_std": 0.5, "grad_norm": 0.08774584531784058, "kl": 0.0, "learning_rate": 4.123533471359558e-07, "loss": -0.0012, "num_tokens": 19518106.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 559 }, { "completion_length": 1456.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2801.0, "completions/max_terminated_length": 2801.0, "completions/mean_length": 1456.75, "completions/mean_terminated_length": 1456.75, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "epoch": 0.18995929443690637, "frac_reward_zero_std": 0.5, "grad_norm": 0.4802844822406769, "kl": 0.0, "learning_rate": 4.121808143547274e-07, "loss": 0.0104, "num_tokens": 19546603.0, "reward": 0.6666667461395264, "reward_std": 0.18618986010551453, "rewards/correctness_reward_func/mean": 0.36666664481163025, "rewards/correctness_reward_func/std": 0.45792677998542786, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 560 }, { "completion_length": 1429.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3128.0, "completions/max_terminated_length": 3128.0, "completions/mean_length": 1429.5833740234375, "completions/mean_terminated_length": 1429.5833740234375, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.19029850746268656, "frac_reward_zero_std": 0.0, "grad_norm": 0.36741065979003906, "kl": 0.0, "learning_rate": 4.120082815734989e-07, "loss": 0.0001, "num_tokens": 19574510.0, "reward": 0.595833420753479, "reward_std": 0.20437853038311005, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 561 }, { "completion_length": 1721.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3282.0, "completions/max_terminated_length": 3282.0, "completions/mean_length": 1721.0833740234375, "completions/mean_terminated_length": 1721.0833740234375, "completions/min_length": 762.0, "completions/min_terminated_length": 762.0, "epoch": 0.19063772048846675, "frac_reward_zero_std": 1.0, "grad_norm": 1.8861383921375818e-07, "kl": 0.0, "learning_rate": 4.118357487922705e-07, "loss": 0.0, "num_tokens": 19607967.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 562 }, { "completion_length": 1103.0000305175781, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 1976.0, "completions/mean_length": 1652.0833740234375, "completions/mean_terminated_length": 1203.272705078125, "completions/min_length": 562.0, "completions/min_terminated_length": 562.0, "epoch": 0.19097693351424694, "frac_reward_zero_std": 0.5, "grad_norm": 0.17974478006362915, "kl": NaN, "learning_rate": 4.1166321601104207e-07, "loss": -0.0182, "num_tokens": 19638069.0, "reward": 1.058333396911621, "reward_std": 0.25380438566207886, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 563 }, { "completion_length": 660.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1087.0, "completions/max_terminated_length": 1087.0, "completions/mean_length": 660.5, "completions/mean_terminated_length": 660.5, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 0.19131614654002713, "frac_reward_zero_std": 1.0, "grad_norm": 1.9672070550313947e-07, "kl": 0.0, "learning_rate": 4.114906832298137e-07, "loss": 0.0, "num_tokens": 19662405.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 564 }, { "completion_length": 913.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2214.0, "completions/max_terminated_length": 2214.0, "completions/mean_length": 913.4166870117188, "completions/mean_terminated_length": 913.4166870117188, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 0.19165535956580732, "frac_reward_zero_std": 0.5, "grad_norm": 0.26728707551956177, "kl": 0.0, "learning_rate": 4.113181504485852e-07, "loss": -0.0042, "num_tokens": 19688930.0, "reward": 1.0333335399627686, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 565 }, { "completion_length": 621.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 621.0, "completions/mean_terminated_length": 621.0, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.1919945725915875, "frac_reward_zero_std": 1.0, "grad_norm": 8.458977163172676e-08, "kl": 0.0, "learning_rate": 4.111456176673568e-07, "loss": 0.0, "num_tokens": 19708724.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 566 }, { "completion_length": 2206.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4598.0, "completions/max_terminated_length": 4598.0, "completions/mean_length": 2206.0, "completions/mean_terminated_length": 2206.0, "completions/min_length": 614.0, "completions/min_terminated_length": 614.0, "epoch": 0.19233378561736772, "frac_reward_zero_std": 0.0, "grad_norm": 0.29058995842933655, "kl": 0.0, "learning_rate": 4.109730848861284e-07, "loss": -0.0083, "num_tokens": 19746638.0, "reward": 1.1166666746139526, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 567 }, { "completion_length": 1693.0834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 6508.0, "completions/max_terminated_length": 6508.0, "completions/mean_length": 1693.0833740234375, "completions/mean_terminated_length": 1693.0833740234375, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.1926729986431479, "frac_reward_zero_std": 0.5, "grad_norm": 0.44586482644081116, "kl": 0.0, "learning_rate": 4.108005521048999e-07, "loss": -0.0105, "num_tokens": 19778715.0, "reward": 0.7166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 568 }, { "completion_length": 2577.2501220703125, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6125.0, "completions/mean_length": 3675.416748046875, "completions/mean_terminated_length": 3092.699951171875, "completions/min_length": 1502.0, "completions/min_terminated_length": 1502.0, "epoch": 0.1930122116689281, "frac_reward_zero_std": 0.0, "grad_norm": 0.7111157774925232, "kl": NaN, "learning_rate": 4.106280193236715e-07, "loss": -0.0213, "num_tokens": 19822152.0, "reward": 0.44166669249534607, "reward_std": 0.31943613290786743, "rewards/correctness_reward_func/mean": 0.1666666716337204, "rewards/correctness_reward_func/std": 0.38924944400787354, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 569 }, { "completion_length": 1982.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2881.0, "completions/max_terminated_length": 2881.0, "completions/mean_length": 1982.3333740234375, "completions/mean_terminated_length": 1982.3333740234375, "completions/min_length": 1068.0, "completions/min_terminated_length": 1068.0, "epoch": 0.19335142469470828, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.1045548654244304e-07, "loss": 0.0, "num_tokens": 19858936.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 570 }, { "completion_length": 1354.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2324.0, "completions/max_terminated_length": 2324.0, "completions/mean_length": 1354.416748046875, "completions/mean_terminated_length": 1354.416748046875, "completions/min_length": 844.0, "completions/min_terminated_length": 844.0, "epoch": 0.19369063772048847, "frac_reward_zero_std": 1.0, "grad_norm": 1.0446248666085012e-07, "kl": 0.0, "learning_rate": 4.1028295376121465e-07, "loss": 0.0, "num_tokens": 19888419.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 571 }, { "completion_length": 2613.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4610.0, "completions/max_terminated_length": 4610.0, "completions/mean_length": 2613.5, "completions/mean_terminated_length": 2613.5, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "epoch": 0.19402985074626866, "frac_reward_zero_std": 1.0, "grad_norm": 1.4929959490928013e-07, "kl": 0.0, "learning_rate": 4.1011042097998615e-07, "loss": 0.0, "num_tokens": 19931457.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 572 }, { "completion_length": 1108.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2757.0, "completions/max_terminated_length": 2757.0, "completions/mean_length": 1108.8333740234375, "completions/mean_terminated_length": 1108.8333740234375, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.19436906377204885, "frac_reward_zero_std": 0.5, "grad_norm": 0.05309825763106346, "kl": 0.0, "learning_rate": 4.0993788819875776e-07, "loss": -0.0001, "num_tokens": 19959157.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 573 }, { "completion_length": 811.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2012.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 811.0, "completions/mean_terminated_length": 811.0, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.19470827679782904, "frac_reward_zero_std": 0.5, "grad_norm": 0.2551961839199066, "kl": 0.0, "learning_rate": 4.097653554175293e-07, "loss": -0.001, "num_tokens": 19981459.0, "reward": 1.0333335399627686, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.2309401035308838, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 574 }, { "completion_length": 1731.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3343.0, "completions/max_terminated_length": 3343.0, "completions/mean_length": 1731.666748046875, "completions/mean_terminated_length": 1731.666748046875, "completions/min_length": 636.0, "completions/min_terminated_length": 636.0, "epoch": 0.19504748982360923, "frac_reward_zero_std": 0.0, "grad_norm": 0.12224812805652618, "kl": 0.0, "learning_rate": 4.095928226363009e-07, "loss": 0.0014, "num_tokens": 20017251.0, "reward": 1.2000000476837158, "reward_std": 0.08164963126182556, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 575 }, { "completion_length": 2264.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4903.0, "completions/max_terminated_length": 4903.0, "completions/mean_length": 2264.916748046875, "completions/mean_terminated_length": 2264.916748046875, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "epoch": 0.19538670284938942, "frac_reward_zero_std": 0.5, "grad_norm": 0.07465171068906784, "kl": 0.0, "learning_rate": 4.094202898550724e-07, "loss": 0.0004, "num_tokens": 20059640.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 576 }, { "completion_length": 711.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1438.0, "completions/max_terminated_length": 1438.0, "completions/mean_length": 711.9166870117188, "completions/mean_terminated_length": 711.9166870117188, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.1957259158751696, "frac_reward_zero_std": 0.5, "grad_norm": 0.3539010286331177, "kl": 0.0, "learning_rate": 4.09247757073844e-07, "loss": -0.0026, "num_tokens": 20076679.0, "reward": 1.000000238418579, "reward_std": 0.23664319515228271, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.3357488214969635, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 577 }, { "completion_length": 2636.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5236.0, "completions/max_terminated_length": 5236.0, "completions/mean_length": 2636.0, "completions/mean_terminated_length": 2636.0, "completions/min_length": 934.0, "completions/min_terminated_length": 934.0, "epoch": 0.1960651289009498, "frac_reward_zero_std": 0.0, "grad_norm": 1.1721926927566528, "kl": 0.0, "learning_rate": 4.0907522429261557e-07, "loss": 0.0262, "num_tokens": 20117005.0, "reward": 0.8041667342185974, "reward_std": 0.43039870262145996, "rewards/correctness_reward_func/mean": 0.5166666507720947, "rewards/correctness_reward_func/std": 0.4628632962703705, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 578 }, { "completion_length": 1920.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4433.0, "completions/max_terminated_length": 4433.0, "completions/mean_length": 1920.5833740234375, "completions/mean_terminated_length": 1920.5833740234375, "completions/min_length": 811.0, "completions/min_terminated_length": 811.0, "epoch": 0.19640434192672998, "frac_reward_zero_std": 0.5, "grad_norm": 0.5124653577804565, "kl": 0.0, "learning_rate": 4.089026915113871e-07, "loss": -0.0106, "num_tokens": 20153522.0, "reward": 1.0833332538604736, "reward_std": 0.222860187292099, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.37618499994277954, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 579 }, { "completion_length": 1807.25, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6471.0, "completions/mean_length": 3454.5, "completions/mean_terminated_length": 2409.666748046875, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 0.19674355495251017, "frac_reward_zero_std": 0.5, "grad_norm": 0.17243024706840515, "kl": NaN, "learning_rate": 4.087301587301587e-07, "loss": -0.0296, "num_tokens": 20189837.0, "reward": 0.6125000715255737, "reward_std": 0.07373940199613571, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.21250002086162567, "rewards/format_reward_func/std": 0.13505050539970398, "step": 580 }, { "completion_length": 2128.25, "completions/clipped_ratio": 0.0, "completions/max_length": 4383.0, "completions/max_terminated_length": 4383.0, "completions/mean_length": 2128.25, "completions/mean_terminated_length": 2128.25, "completions/min_length": 700.0, "completions/min_terminated_length": 700.0, "epoch": 0.19708276797829036, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.085576259489303e-07, "loss": 0.0, "num_tokens": 20230106.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 581 }, { "completion_length": 2194.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4675.0, "completions/max_terminated_length": 4675.0, "completions/mean_length": 2194.83349609375, "completions/mean_terminated_length": 2194.83349609375, "completions/min_length": 799.0, "completions/min_terminated_length": 799.0, "epoch": 0.19742198100407055, "frac_reward_zero_std": 0.0, "grad_norm": 0.6451249122619629, "kl": 0.0, "learning_rate": 4.083850931677019e-07, "loss": -0.0096, "num_tokens": 20267946.0, "reward": 1.070833444595337, "reward_std": 0.29760777950286865, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.37618499994277954, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 582 }, { "completion_length": 1643.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3262.0, "completions/max_terminated_length": 3262.0, "completions/mean_length": 1643.916748046875, "completions/mean_terminated_length": 1643.916748046875, "completions/min_length": 862.0, "completions/min_terminated_length": 862.0, "epoch": 0.19776119402985073, "frac_reward_zero_std": 0.0, "grad_norm": 0.13853637874126434, "kl": 0.0, "learning_rate": 4.082125603864734e-07, "loss": -0.0048, "num_tokens": 20298029.0, "reward": 1.2208333015441895, "reward_std": 0.10064341127872467, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 583 }, { "completion_length": 2544.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5490.0, "completions/max_terminated_length": 5490.0, "completions/mean_length": 2544.166748046875, "completions/mean_terminated_length": 2544.166748046875, "completions/min_length": 862.0, "completions/min_terminated_length": 862.0, "epoch": 0.19810040705563092, "frac_reward_zero_std": 0.5, "grad_norm": 0.7164510488510132, "kl": 0.0, "learning_rate": 4.08040027605245e-07, "loss": 0.0183, "num_tokens": 20340841.0, "reward": 1.1166666746139526, "reward_std": 0.24832773208618164, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.3857302963733673, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 584 }, { "completion_length": 843.9166870117188, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 2172.0, "completions/mean_length": 4138.4169921875, "completions/mean_terminated_length": 1687.8333740234375, "completions/min_length": 1129.0, "completions/min_terminated_length": 1129.0, "epoch": 0.19843962008141114, "frac_reward_zero_std": 0.5, "grad_norm": 0.10080356895923615, "kl": NaN, "learning_rate": 4.0786749482401655e-07, "loss": -0.001, "num_tokens": 20363022.0, "reward": 0.5833333730697632, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 585 }, { "completion_length": 1471.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2505.0, "completions/max_terminated_length": 2505.0, "completions/mean_length": 1471.416748046875, "completions/mean_terminated_length": 1471.416748046875, "completions/min_length": 909.0, "completions/min_terminated_length": 909.0, "epoch": 0.19877883310719133, "frac_reward_zero_std": 0.5, "grad_norm": 0.07403448224067688, "kl": 0.0, "learning_rate": 4.0769496204278815e-07, "loss": -0.0001, "num_tokens": 20390729.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 586 }, { "completion_length": 1442.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2322.0, "completions/max_terminated_length": 2322.0, "completions/mean_length": 1442.5833740234375, "completions/mean_terminated_length": 1442.5833740234375, "completions/min_length": 698.0, "completions/min_terminated_length": 698.0, "epoch": 0.19911804613297152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.0752242926155965e-07, "loss": 0.0, "num_tokens": 20417646.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 587 }, { "completion_length": 1039.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1718.0, "completions/max_terminated_length": 1718.0, "completions/mean_length": 1039.416748046875, "completions/mean_terminated_length": 1039.416748046875, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "epoch": 0.1994572591587517, "frac_reward_zero_std": 0.5, "grad_norm": 0.09593465924263, "kl": 0.0, "learning_rate": 4.0734989648033126e-07, "loss": -0.0003, "num_tokens": 20442431.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 588 }, { "completion_length": 3065.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6188.0, "completions/max_terminated_length": 6188.0, "completions/mean_length": 3065.416748046875, "completions/mean_terminated_length": 3065.416748046875, "completions/min_length": 804.0, "completions/min_terminated_length": 804.0, "epoch": 0.1997964721845319, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.071773636991028e-07, "loss": 0.0, "num_tokens": 20492140.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 589 }, { "completion_length": 704.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 704.8333740234375, "completions/mean_terminated_length": 704.8333740234375, "completions/min_length": 576.0, "completions/min_terminated_length": 576.0, "epoch": 0.20013568521031208, "frac_reward_zero_std": 1.0, "grad_norm": 2.2550463540937926e-07, "kl": 0.0, "learning_rate": 4.0700483091787437e-07, "loss": 0.0, "num_tokens": 20516426.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 590 }, { "completion_length": 1735.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3626.0, "completions/max_terminated_length": 3626.0, "completions/mean_length": 1735.8333740234375, "completions/mean_terminated_length": 1735.8333740234375, "completions/min_length": 1012.0, "completions/min_terminated_length": 1012.0, "epoch": 0.20047489823609227, "frac_reward_zero_std": 0.5, "grad_norm": 0.45365244150161743, "kl": 0.0, "learning_rate": 4.068322981366459e-07, "loss": 0.0057, "num_tokens": 20547852.0, "reward": 0.5666667222976685, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.2666666805744171, "rewards/correctness_reward_func/std": 0.393892765045166, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 591 }, { "completion_length": 1736.8333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4041.0, "completions/mean_length": 2285.916748046875, "completions/mean_terminated_length": 1894.727294921875, "completions/min_length": 598.0, "completions/min_terminated_length": 598.0, "epoch": 0.20081411126187246, "frac_reward_zero_std": 0.0, "grad_norm": 0.2545448839664459, "kl": NaN, "learning_rate": 4.066597653554175e-07, "loss": -0.034, "num_tokens": 20578780.0, "reward": 1.0750000476837158, "reward_std": 0.29088661074638367, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 592 }, { "completion_length": 1546.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2271.0, "completions/max_terminated_length": 2271.0, "completions/mean_length": 1546.916748046875, "completions/mean_terminated_length": 1546.916748046875, "completions/min_length": 638.0, "completions/min_terminated_length": 638.0, "epoch": 0.20115332428765265, "frac_reward_zero_std": 0.5, "grad_norm": 0.09835981577634811, "kl": 0.0, "learning_rate": 4.064872325741891e-07, "loss": -0.0017, "num_tokens": 20607141.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 593 }, { "completion_length": 2359.0833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4462.0, "completions/mean_length": 2908.166748046875, "completions/mean_terminated_length": 2573.54541015625, "completions/min_length": 781.0, "completions/min_terminated_length": 781.0, "epoch": 0.20149253731343283, "frac_reward_zero_std": 0.5, "grad_norm": 0.07446026802062988, "kl": NaN, "learning_rate": 4.0631469979296063e-07, "loss": -0.004, "num_tokens": 20643640.0, "reward": 0.27500003576278687, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 594 }, { "completion_length": 1952.8333740234375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6144.0, "completions/mean_length": 3051.0, "completions/mean_terminated_length": 2343.400146484375, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "epoch": 0.20183175033921302, "frac_reward_zero_std": 0.5, "grad_norm": 0.30042797327041626, "kl": NaN, "learning_rate": 4.061421670117322e-07, "loss": -0.0247, "num_tokens": 20680088.0, "reward": 0.6375000476837158, "reward_std": 0.07373940199613571, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.11894422769546509, "step": 595 }, { "completion_length": 1296.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2377.0, "completions/max_terminated_length": 2377.0, "completions/mean_length": 1296.25, "completions/mean_terminated_length": 1296.25, "completions/min_length": 656.0, "completions/min_terminated_length": 656.0, "epoch": 0.2021709633649932, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.059696342305038e-07, "loss": 0.0, "num_tokens": 20705621.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 596 }, { "completion_length": 1768.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 4853.0, "completions/max_terminated_length": 4853.0, "completions/mean_length": 1768.0833740234375, "completions/mean_terminated_length": 1768.0833740234375, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "epoch": 0.2025101763907734, "frac_reward_zero_std": 1.0, "grad_norm": 2.6289086463293643e-07, "kl": 0.0, "learning_rate": 4.057971014492754e-07, "loss": 0.0, "num_tokens": 20738388.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 597 }, { "completion_length": 2543.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4530.0, "completions/max_terminated_length": 4530.0, "completions/mean_length": 2543.0, "completions/mean_terminated_length": 2543.0, "completions/min_length": 1381.0, "completions/min_terminated_length": 1381.0, "epoch": 0.2028493894165536, "frac_reward_zero_std": 1.0, "grad_norm": 1.6963534221758891e-07, "kl": 0.0, "learning_rate": 4.056245686680469e-07, "loss": 0.0, "num_tokens": 20782416.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 598 }, { "completion_length": 1435.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2731.0, "completions/max_terminated_length": 2731.0, "completions/mean_length": 1435.3333740234375, "completions/mean_terminated_length": 1435.3333740234375, "completions/min_length": 720.0, "completions/min_terminated_length": 720.0, "epoch": 0.20318860244233378, "frac_reward_zero_std": 0.5, "grad_norm": 0.1025226041674614, "kl": 0.0, "learning_rate": 4.054520358868185e-07, "loss": 0.0002, "num_tokens": 20813830.0, "reward": 1.1500000953674316, "reward_std": 0.05477222427725792, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 599 }, { "completion_length": 794.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1313.0, "completions/max_terminated_length": 1313.0, "completions/mean_length": 794.75, "completions/mean_terminated_length": 794.75, "completions/min_length": 557.0, "completions/min_terminated_length": 557.0, "epoch": 0.20352781546811397, "frac_reward_zero_std": 0.5, "grad_norm": 0.06588659435510635, "kl": 0.0, "learning_rate": 4.0527950310559005e-07, "loss": 0.0002, "num_tokens": 20837977.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 600 }, { "completion_length": 1211.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1856.0, "completions/max_terminated_length": 1856.0, "completions/mean_length": 1211.3333740234375, "completions/mean_terminated_length": 1211.3333740234375, "completions/min_length": 832.0, "completions/min_terminated_length": 832.0, "epoch": 0.20386702849389415, "frac_reward_zero_std": 1.0, "grad_norm": 1.0307384457064472e-07, "kl": 0.0, "learning_rate": 4.051069703243616e-07, "loss": 0.0, "num_tokens": 20861033.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 601 }, { "completion_length": 2811.416748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6511.0, "completions/mean_length": 3360.5, "completions/mean_terminated_length": 3067.0, "completions/min_length": 1628.0, "completions/min_terminated_length": 1628.0, "epoch": 0.20420624151967434, "frac_reward_zero_std": 0.0, "grad_norm": 0.17239151895046234, "kl": NaN, "learning_rate": 4.0493443754313316e-07, "loss": -0.0131, "num_tokens": 20907724.0, "reward": 1.2208333015441895, "reward_std": 0.1265007108449936, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 602 }, { "completion_length": 2544.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5494.0, "completions/max_terminated_length": 5494.0, "completions/mean_length": 2544.75, "completions/mean_terminated_length": 2544.75, "completions/min_length": 660.0, "completions/min_terminated_length": 660.0, "epoch": 0.20454545454545456, "frac_reward_zero_std": 0.0, "grad_norm": 0.5830628871917725, "kl": 0.0, "learning_rate": 4.0476190476190476e-07, "loss": -0.0089, "num_tokens": 20947273.0, "reward": 0.9666666984558105, "reward_std": 0.2707287669181824, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 603 }, { "completion_length": 2008.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3416.0, "completions/max_terminated_length": 3416.0, "completions/mean_length": 2008.25, "completions/mean_terminated_length": 2008.25, "completions/min_length": 621.0, "completions/min_terminated_length": 621.0, "epoch": 0.20488466757123475, "frac_reward_zero_std": 0.5, "grad_norm": 0.4334089159965515, "kl": 0.0, "learning_rate": 4.045893719806763e-07, "loss": -0.0058, "num_tokens": 20984080.0, "reward": 0.6208333969116211, "reward_std": 0.19391795992851257, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 604 }, { "completion_length": 1733.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3784.0, "completions/max_terminated_length": 3784.0, "completions/mean_length": 1733.0, "completions/mean_terminated_length": 1733.0, "completions/min_length": 863.0, "completions/min_terminated_length": 863.0, "epoch": 0.20522388059701493, "frac_reward_zero_std": 0.5, "grad_norm": 0.4287903904914856, "kl": 0.0, "learning_rate": 4.0441683919944787e-07, "loss": 0.0233, "num_tokens": 21015982.0, "reward": 0.833333432674408, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 605 }, { "completion_length": 2514.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4962.0, "completions/max_terminated_length": 4962.0, "completions/mean_length": 2514.166748046875, "completions/mean_terminated_length": 2514.166748046875, "completions/min_length": 1261.0, "completions/min_terminated_length": 1261.0, "epoch": 0.20556309362279512, "frac_reward_zero_std": 0.0, "grad_norm": 0.6668354272842407, "kl": 0.0, "learning_rate": 4.042443064182194e-07, "loss": 0.03, "num_tokens": 21056592.0, "reward": 1.070833444595337, "reward_std": 0.2486901879310608, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 606 }, { "completion_length": 1062.1666870117188, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 3721.0, "completions/mean_length": 2160.33349609375, "completions/mean_terminated_length": 1274.5999755859375, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.2059023066485753, "frac_reward_zero_std": 0.5, "grad_norm": 0.10183367133140564, "kl": NaN, "learning_rate": 4.0407177363699103e-07, "loss": -0.0125, "num_tokens": 21083894.0, "reward": 0.6500000953674316, "reward_std": 0.0774596780538559, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 607 }, { "completion_length": 2220.5001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6247.0, "completions/mean_length": 2769.58349609375, "completions/mean_terminated_length": 2422.36376953125, "completions/min_length": 623.0, "completions/min_terminated_length": 623.0, "epoch": 0.2062415196743555, "frac_reward_zero_std": 0.0, "grad_norm": 0.11050547659397125, "kl": NaN, "learning_rate": 4.038992408557626e-07, "loss": -0.0143, "num_tokens": 21120482.0, "reward": 0.7416666746139526, "reward_std": 0.1128769963979721, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 608 }, { "completion_length": 1264.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1872.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 1264.5833740234375, "completions/mean_terminated_length": 1264.5833740234375, "completions/min_length": 664.0, "completions/min_terminated_length": 664.0, "epoch": 0.2065807327001357, "frac_reward_zero_std": 1.0, "grad_norm": 1.325391423279143e-07, "kl": 0.0, "learning_rate": 4.0372670807453413e-07, "loss": 0.0, "num_tokens": 21142509.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 609 }, { "completion_length": 1594.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3316.0, "completions/max_terminated_length": 3316.0, "completions/mean_length": 1594.5, "completions/mean_terminated_length": 1594.5, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 0.20691994572591588, "frac_reward_zero_std": 1.0, "grad_norm": 9.580161020039668e-08, "kl": 0.0, "learning_rate": 4.035541752933057e-07, "loss": 0.0, "num_tokens": 21176565.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 610 }, { "completion_length": 2312.916748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5161.0, "completions/mean_length": 2862.0, "completions/mean_terminated_length": 2523.181884765625, "completions/min_length": 646.0, "completions/min_terminated_length": 646.0, "epoch": 0.20725915875169607, "frac_reward_zero_std": 0.5, "grad_norm": 0.09654112905263901, "kl": NaN, "learning_rate": 4.033816425120773e-07, "loss": -0.0072, "num_tokens": 21219494.0, "reward": 0.6750000715255737, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 611 }, { "completion_length": 1352.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2310.0, "completions/max_terminated_length": 2310.0, "completions/mean_length": 1352.666748046875, "completions/mean_terminated_length": 1352.666748046875, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "epoch": 0.20759837177747625, "frac_reward_zero_std": 1.0, "grad_norm": 1.9667629658215446e-07, "kl": 0.0, "learning_rate": 4.032091097308488e-07, "loss": 0.0, "num_tokens": 21250126.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 612 }, { "completion_length": 2175.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4093.0, "completions/max_terminated_length": 4093.0, "completions/mean_length": 2175.58349609375, "completions/mean_terminated_length": 2175.58349609375, "completions/min_length": 1203.0, "completions/min_terminated_length": 1203.0, "epoch": 0.20793758480325644, "frac_reward_zero_std": 0.0, "grad_norm": 0.5248443484306335, "kl": 0.0, "learning_rate": 4.030365769496204e-07, "loss": 0.0088, "num_tokens": 21281873.0, "reward": 1.1666667461395264, "reward_std": 0.2588964104652405, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.287096232175827, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 613 }, { "completion_length": 2334.75, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6393.0, "completions/mean_length": 4531.08349609375, "completions/mean_terminated_length": 3502.125, "completions/min_length": 880.0, "completions/min_terminated_length": 880.0, "epoch": 0.20827679782903663, "frac_reward_zero_std": 0.5, "grad_norm": 0.24544914066791534, "kl": NaN, "learning_rate": 4.02864044168392e-07, "loss": -0.0219, "num_tokens": 21318176.0, "reward": 0.6000000834465027, "reward_std": 0.0774596780538559, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 614 }, { "completion_length": 606.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1353.0, "completions/max_terminated_length": 1353.0, "completions/mean_length": 606.3333740234375, "completions/mean_terminated_length": 606.3333740234375, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.20861601085481682, "frac_reward_zero_std": 0.5, "grad_norm": 0.37288540601730347, "kl": 0.0, "learning_rate": 4.0269151138716356e-07, "loss": -0.0039, "num_tokens": 21336690.0, "reward": 1.1041667461395264, "reward_std": 0.23474276065826416, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 615 }, { "completion_length": 1619.416748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4832.0, "completions/mean_length": 2168.5, "completions/mean_terminated_length": 1766.636474609375, "completions/min_length": 653.0, "completions/min_terminated_length": 653.0, "epoch": 0.208955223880597, "frac_reward_zero_std": 0.5, "grad_norm": 0.07448533922433853, "kl": NaN, "learning_rate": 4.025189786059351e-07, "loss": -0.0086, "num_tokens": 21372959.0, "reward": 0.7749999761581421, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 616 }, { "completion_length": 2493.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5083.0, "completions/max_terminated_length": 5083.0, "completions/mean_length": 2493.75, "completions/mean_terminated_length": 2493.75, "completions/min_length": 848.0, "completions/min_terminated_length": 848.0, "epoch": 0.2092944369063772, "frac_reward_zero_std": 1.0, "grad_norm": 1.4945359794182878e-07, "kl": 0.0, "learning_rate": 4.0234644582470666e-07, "loss": 0.0, "num_tokens": 21419780.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 617 }, { "completion_length": 1921.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2628.0, "completions/max_terminated_length": 2628.0, "completions/mean_length": 1921.0833740234375, "completions/mean_terminated_length": 1921.0833740234375, "completions/min_length": 1042.0, "completions/min_terminated_length": 1042.0, "epoch": 0.20963364993215738, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.0217391304347827e-07, "loss": 0.0, "num_tokens": 21449523.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 618 }, { "completion_length": 701.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1136.0, "completions/max_terminated_length": 1136.0, "completions/mean_length": 701.75, "completions/mean_terminated_length": 701.75, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.20997286295793757, "frac_reward_zero_std": 0.5, "grad_norm": 0.060739900916814804, "kl": 0.0, "learning_rate": 4.020013802622498e-07, "loss": 0.0002, "num_tokens": 21467394.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 619 }, { "completion_length": 1590.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3286.0, "completions/max_terminated_length": 3286.0, "completions/mean_length": 1590.416748046875, "completions/mean_terminated_length": 1590.416748046875, "completions/min_length": 756.0, "completions/min_terminated_length": 756.0, "epoch": 0.21031207598371776, "frac_reward_zero_std": 0.0, "grad_norm": 0.14383164048194885, "kl": 0.0, "learning_rate": 4.018288474810214e-07, "loss": 0.0018, "num_tokens": 21492803.0, "reward": 1.183333396911621, "reward_std": 0.09246458858251572, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 620 }, { "completion_length": 2816.75, "completions/clipped_ratio": 0.0, "completions/max_length": 4851.0, "completions/max_terminated_length": 4851.0, "completions/mean_length": 2816.75, "completions/mean_terminated_length": 2816.75, "completions/min_length": 1029.0, "completions/min_terminated_length": 1029.0, "epoch": 0.21065128900949798, "frac_reward_zero_std": 0.5, "grad_norm": 0.1251380890607834, "kl": 0.0, "learning_rate": 4.0165631469979293e-07, "loss": -0.0009, "num_tokens": 21541196.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 621 }, { "completion_length": 977.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1705.0, "completions/max_terminated_length": 1705.0, "completions/mean_length": 977.25, "completions/mean_terminated_length": 977.25, "completions/min_length": 595.0, "completions/min_terminated_length": 595.0, "epoch": 0.21099050203527817, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.0148378191856453e-07, "loss": 0.0, "num_tokens": 21562223.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 622 }, { "completion_length": 1879.0834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 4423.0, "completions/max_terminated_length": 4423.0, "completions/mean_length": 1879.0833740234375, "completions/mean_terminated_length": 1879.0833740234375, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.21132971506105835, "frac_reward_zero_std": 0.0, "grad_norm": 0.135577991604805, "kl": 0.0, "learning_rate": 4.0131124913733603e-07, "loss": -0.0018, "num_tokens": 21602580.0, "reward": 1.1541666984558105, "reward_std": 0.06497842073440552, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 623 }, { "completion_length": 2042.5, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6396.0, "completions/mean_length": 3689.75, "completions/mean_terminated_length": 2723.333251953125, "completions/min_length": 858.0, "completions/min_terminated_length": 858.0, "epoch": 0.21166892808683854, "frac_reward_zero_std": 0.0, "grad_norm": 1.0774798393249512, "kl": NaN, "learning_rate": 4.0113871635610764e-07, "loss": -0.1023, "num_tokens": 21640638.0, "reward": 0.8916667699813843, "reward_std": 0.3968444764614105, "rewards/correctness_reward_func/mean": 0.6666666269302368, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 624 }, { "completion_length": 1797.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4817.0, "completions/max_terminated_length": 4817.0, "completions/mean_length": 1797.5833740234375, "completions/mean_terminated_length": 1797.5833740234375, "completions/min_length": 551.0, "completions/min_terminated_length": 551.0, "epoch": 0.21200814111261873, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.009661835748792e-07, "loss": 0.0, "num_tokens": 21676483.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 625 }, { "completion_length": 2595.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5171.0, "completions/max_terminated_length": 5171.0, "completions/mean_length": 2595.0, "completions/mean_terminated_length": 2595.0, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "epoch": 0.21234735413839892, "frac_reward_zero_std": 0.0, "grad_norm": 0.4653422236442566, "kl": 0.0, "learning_rate": 4.007936507936508e-07, "loss": -0.0322, "num_tokens": 21721933.0, "reward": 1.1708333492279053, "reward_std": 0.2519000172615051, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 626 }, { "completion_length": 768.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1599.0, "completions/max_terminated_length": 1599.0, "completions/mean_length": 768.5, "completions/mean_terminated_length": 768.5, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.2126865671641791, "frac_reward_zero_std": 0.5, "grad_norm": 0.05752098560333252, "kl": 0.0, "learning_rate": 4.006211180124223e-07, "loss": -0.0001, "num_tokens": 21742123.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 627 }, { "completion_length": 1799.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3195.0, "completions/max_terminated_length": 3195.0, "completions/mean_length": 1799.0833740234375, "completions/mean_terminated_length": 1799.0833740234375, "completions/min_length": 731.0, "completions/min_terminated_length": 731.0, "epoch": 0.2130257801899593, "frac_reward_zero_std": 0.5, "grad_norm": 0.11317743360996246, "kl": 0.0, "learning_rate": 4.004485852311939e-07, "loss": 0.0022, "num_tokens": 21775028.0, "reward": 0.7041667699813843, "reward_std": 0.05571504682302475, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 628 }, { "completion_length": 2136.5834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 4480.0, "completions/max_terminated_length": 4480.0, "completions/mean_length": 2136.58349609375, "completions/mean_terminated_length": 2136.58349609375, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 0.21336499321573948, "frac_reward_zero_std": 1.0, "grad_norm": 1.4856317420708365e-07, "kl": 0.0, "learning_rate": 4.002760524499655e-07, "loss": 0.0, "num_tokens": 21813543.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 629 }, { "completion_length": 1928.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5380.0, "completions/max_terminated_length": 5380.0, "completions/mean_length": 1928.5833740234375, "completions/mean_terminated_length": 1928.5833740234375, "completions/min_length": 874.0, "completions/min_terminated_length": 874.0, "epoch": 0.21370420624151967, "frac_reward_zero_std": 0.5, "grad_norm": 0.1090475544333458, "kl": 0.0, "learning_rate": 4.0010351966873706e-07, "loss": 0.0027, "num_tokens": 21848944.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 630 }, { "completion_length": 1004.1666870117188, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3440.0, "completions/mean_length": 1553.25, "completions/mean_terminated_length": 1095.45458984375, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.21404341926729986, "frac_reward_zero_std": 0.5, "grad_norm": 0.3620413541793823, "kl": NaN, "learning_rate": 3.999309868875086e-07, "loss": -0.0181, "num_tokens": 21870372.0, "reward": 1.1583333015441895, "reward_std": 0.25380438566207886, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 631 }, { "completion_length": 2100.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 3441.0, "completions/max_terminated_length": 3441.0, "completions/mean_length": 2100.75, "completions/mean_terminated_length": 2100.75, "completions/min_length": 830.0, "completions/min_terminated_length": 830.0, "epoch": 0.21438263229308005, "frac_reward_zero_std": 0.0, "grad_norm": 0.1738094985485077, "kl": 0.0, "learning_rate": 3.9975845410628017e-07, "loss": 0.0015, "num_tokens": 21907617.0, "reward": 1.2166666984558105, "reward_std": 0.10641199350357056, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 632 }, { "completion_length": 3179.416748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6277.0, "completions/mean_length": 3728.5, "completions/mean_terminated_length": 3468.45458984375, "completions/min_length": 1711.0, "completions/min_terminated_length": 1711.0, "epoch": 0.21472184531886024, "frac_reward_zero_std": 0.5, "grad_norm": 0.8372994661331177, "kl": NaN, "learning_rate": 3.9958592132505177e-07, "loss": -0.0076, "num_tokens": 21959930.0, "reward": 0.8750001788139343, "reward_std": 0.2524876296520233, "rewards/correctness_reward_func/mean": 0.5999999642372131, "rewards/correctness_reward_func/std": 0.36181363463401794, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 633 }, { "completion_length": 1077.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2167.0, "completions/max_terminated_length": 2167.0, "completions/mean_length": 1077.416748046875, "completions/mean_terminated_length": 1077.416748046875, "completions/min_length": 652.0, "completions/min_terminated_length": 652.0, "epoch": 0.21506105834464043, "frac_reward_zero_std": 1.0, "grad_norm": 2.7163471827407193e-07, "kl": 0.0, "learning_rate": 3.9941338854382327e-07, "loss": 0.0, "num_tokens": 21984481.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 634 }, { "completion_length": 1683.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3120.0, "completions/max_terminated_length": 3120.0, "completions/mean_length": 1683.0, "completions/mean_terminated_length": 1683.0, "completions/min_length": 1112.0, "completions/min_terminated_length": 1112.0, "epoch": 0.21540027137042062, "frac_reward_zero_std": 1.0, "grad_norm": 2.3120736614146153e-07, "kl": 0.0, "learning_rate": 3.992408557625949e-07, "loss": 0.0, "num_tokens": 22015363.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 635 }, { "completion_length": 1880.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4715.0, "completions/max_terminated_length": 4715.0, "completions/mean_length": 1880.0, "completions/mean_terminated_length": 1880.0, "completions/min_length": 657.0, "completions/min_terminated_length": 657.0, "epoch": 0.2157394843962008, "frac_reward_zero_std": 1.0, "grad_norm": 2.4236021545220865e-07, "kl": 0.0, "learning_rate": 3.9906832298136643e-07, "loss": 0.0, "num_tokens": 22054771.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 636 }, { "completion_length": 1170.9166870117188, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 4163.0, "completions/mean_length": 4465.4169921875, "completions/mean_terminated_length": 2341.83349609375, "completions/min_length": 1286.0, "completions/min_terminated_length": 1286.0, "epoch": 0.216078697421981, "frac_reward_zero_std": 0.5, "grad_norm": 0.13113607466220856, "kl": NaN, "learning_rate": 3.9889579020013804e-07, "loss": -0.0029, "num_tokens": 22081302.0, "reward": 0.6166666746139526, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 637 }, { "completion_length": 1980.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5188.0, "completions/max_terminated_length": 5188.0, "completions/mean_length": 1980.0, "completions/mean_terminated_length": 1980.0, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.21641791044776118, "frac_reward_zero_std": 0.5, "grad_norm": 0.6408343315124512, "kl": 0.0, "learning_rate": 3.9872325741890954e-07, "loss": -0.0118, "num_tokens": 22119900.0, "reward": 0.8833333849906921, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.5149286389350891, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 638 }, { "completion_length": 3710.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6489.0, "completions/max_terminated_length": 6489.0, "completions/mean_length": 3710.666748046875, "completions/mean_terminated_length": 3710.666748046875, "completions/min_length": 2540.0, "completions/min_terminated_length": 2540.0, "epoch": 0.2167571234735414, "frac_reward_zero_std": 0.0, "grad_norm": 1.0129146575927734, "kl": 0.0, "learning_rate": 3.9855072463768114e-07, "loss": 0.016, "num_tokens": 22177430.0, "reward": 1.1000001430511475, "reward_std": 0.21493908762931824, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 639 }, { "completion_length": 1299.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2228.0, "completions/max_terminated_length": 2228.0, "completions/mean_length": 1299.416748046875, "completions/mean_terminated_length": 1299.416748046875, "completions/min_length": 931.0, "completions/min_terminated_length": 931.0, "epoch": 0.21709633649932158, "frac_reward_zero_std": 0.5, "grad_norm": 0.09300534427165985, "kl": 0.0, "learning_rate": 3.983781918564527e-07, "loss": 0.0, "num_tokens": 22206049.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 640 }, { "completion_length": 3040.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 6160.0, "completions/max_terminated_length": 6160.0, "completions/mean_length": 3040.08349609375, "completions/mean_terminated_length": 3040.08349609375, "completions/min_length": 627.0, "completions/min_terminated_length": 627.0, "epoch": 0.21743554952510177, "frac_reward_zero_std": 0.5, "grad_norm": 0.1536818891763687, "kl": 0.0, "learning_rate": 3.982056590752243e-07, "loss": 0.0004, "num_tokens": 22256750.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 641 }, { "completion_length": 2255.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5187.0, "completions/max_terminated_length": 5187.0, "completions/mean_length": 2255.25, "completions/mean_terminated_length": 2255.25, "completions/min_length": 754.0, "completions/min_terminated_length": 754.0, "epoch": 0.21777476255088196, "frac_reward_zero_std": 0.5, "grad_norm": 0.8280144929885864, "kl": 0.0, "learning_rate": 3.980331262939958e-07, "loss": 0.0191, "num_tokens": 22293539.0, "reward": 0.6333333253860474, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 642 }, { "completion_length": 1590.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5887.0, "completions/max_terminated_length": 5887.0, "completions/mean_length": 1590.8333740234375, "completions/mean_terminated_length": 1590.8333740234375, "completions/min_length": 726.0, "completions/min_terminated_length": 726.0, "epoch": 0.21811397557666215, "frac_reward_zero_std": 0.5, "grad_norm": 0.10805870592594147, "kl": 0.0, "learning_rate": 3.978605935127674e-07, "loss": 0.0101, "num_tokens": 22323081.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 643 }, { "completion_length": 2101.8334350585938, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6169.0, "completions/mean_length": 3749.08349609375, "completions/mean_terminated_length": 2802.444580078125, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "epoch": 0.21845318860244234, "frac_reward_zero_std": 0.5, "grad_norm": 0.9981639385223389, "kl": NaN, "learning_rate": 3.97688060731539e-07, "loss": -0.0685, "num_tokens": 22360183.0, "reward": 0.7208334803581238, "reward_std": 0.2441396415233612, "rewards/correctness_reward_func/mean": 0.4833333194255829, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.11894422769546509, "step": 644 }, { "completion_length": 1810.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4908.0, "completions/max_terminated_length": 4908.0, "completions/mean_length": 1810.5833740234375, "completions/mean_terminated_length": 1810.5833740234375, "completions/min_length": 656.0, "completions/min_terminated_length": 656.0, "epoch": 0.21879240162822253, "frac_reward_zero_std": 0.0, "grad_norm": 0.16437654197216034, "kl": 0.0, "learning_rate": 3.975155279503105e-07, "loss": -0.006, "num_tokens": 22390598.0, "reward": 1.1666667461395264, "reward_std": 0.09559705853462219, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 645 }, { "completion_length": 1326.6667175292969, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 3824.0, "completions/mean_length": 2424.83349609375, "completions/mean_terminated_length": 1592.0, "completions/min_length": 742.0, "completions/min_terminated_length": 742.0, "epoch": 0.21913161465400272, "frac_reward_zero_std": 0.5, "grad_norm": 0.23768624663352966, "kl": NaN, "learning_rate": 3.973429951690821e-07, "loss": -0.0106, "num_tokens": 22418914.0, "reward": 0.6500000953674316, "reward_std": 0.0774596780538559, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 646 }, { "completion_length": 951.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3743.0, "completions/max_terminated_length": 3743.0, "completions/mean_length": 951.5, "completions/mean_terminated_length": 951.5, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.2194708276797829, "frac_reward_zero_std": 0.0, "grad_norm": 0.10783212631940842, "kl": 0.0, "learning_rate": 3.9717046238785367e-07, "loss": -0.0026, "num_tokens": 22442500.0, "reward": 1.254166603088379, "reward_std": 0.08225837349891663, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 647 }, { "completion_length": 1173.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2590.0, "completions/max_terminated_length": 2590.0, "completions/mean_length": 1173.5, "completions/mean_terminated_length": 1173.5, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.2198100407055631, "frac_reward_zero_std": 1.0, "grad_norm": 1.3344381954993878e-07, "kl": 0.0, "learning_rate": 3.969979296066253e-07, "loss": 0.0, "num_tokens": 22468684.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 648 }, { "completion_length": 1362.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3917.0, "completions/max_terminated_length": 3917.0, "completions/mean_length": 1362.166748046875, "completions/mean_terminated_length": 1362.166748046875, "completions/min_length": 527.0, "completions/min_terminated_length": 527.0, "epoch": 0.22014925373134328, "frac_reward_zero_std": 0.5, "grad_norm": 0.06996418535709381, "kl": 0.0, "learning_rate": 3.968253968253968e-07, "loss": -0.0004, "num_tokens": 22499058.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 649 }, { "completion_length": 1067.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1865.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 1067.0, "completions/mean_terminated_length": 1067.0, "completions/min_length": 601.0, "completions/min_terminated_length": 601.0, "epoch": 0.22048846675712347, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.966528640441684e-07, "loss": 0.0, "num_tokens": 22521366.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 650 }, { "completion_length": 1641.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4213.0, "completions/max_terminated_length": 4213.0, "completions/mean_length": 1641.8333740234375, "completions/mean_terminated_length": 1641.8333740234375, "completions/min_length": 992.0, "completions/min_terminated_length": 992.0, "epoch": 0.22082767978290366, "frac_reward_zero_std": 0.5, "grad_norm": 0.08669435232877731, "kl": 0.0, "learning_rate": 3.9648033126293993e-07, "loss": 0.0011, "num_tokens": 22550260.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 651 }, { "completion_length": 2429.0834350585938, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4880.0, "completions/mean_length": 3527.25, "completions/mean_terminated_length": 2914.900146484375, "completions/min_length": 1257.0, "completions/min_terminated_length": 1257.0, "epoch": 0.22116689280868385, "frac_reward_zero_std": 0.5, "grad_norm": 0.5226991176605225, "kl": NaN, "learning_rate": 3.9630779848171154e-07, "loss": -0.0914, "num_tokens": 22593785.0, "reward": 0.9666668176651001, "reward_std": 0.32506412267684937, "rewards/correctness_reward_func/mean": 0.7166666984558105, "rewards/correctness_reward_func/std": 0.3459725081920624, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 652 }, { "completion_length": 1602.3333740234375, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4316.0, "completions/mean_length": 3798.666748046875, "completions/mean_terminated_length": 2403.5, "completions/min_length": 1219.0, "completions/min_terminated_length": 1219.0, "epoch": 0.22150610583446403, "frac_reward_zero_std": 0.5, "grad_norm": 0.16389794647693634, "kl": NaN, "learning_rate": 3.9613526570048304e-07, "loss": -0.019, "num_tokens": 22626027.0, "reward": 0.20000001788139343, "reward_std": 0.0774596780538559, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 653 }, { "completion_length": 970.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2010.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 970.8333740234375, "completions/mean_terminated_length": 970.8333740234375, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 0.22184531886024422, "frac_reward_zero_std": 0.5, "grad_norm": 0.08526045829057693, "kl": 0.0, "learning_rate": 3.9596273291925465e-07, "loss": -0.0016, "num_tokens": 22649347.0, "reward": 1.2333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 654 }, { "completion_length": 2165.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3378.0, "completions/max_terminated_length": 3378.0, "completions/mean_length": 2165.75, "completions/mean_terminated_length": 2165.75, "completions/min_length": 1147.0, "completions/min_terminated_length": 1147.0, "epoch": 0.2221845318860244, "frac_reward_zero_std": 0.5, "grad_norm": 0.11644528061151505, "kl": 0.0, "learning_rate": 3.957902001380262e-07, "loss": 0.0005, "num_tokens": 22690942.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 655 }, { "completion_length": 3226.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4712.0, "completions/max_terminated_length": 4712.0, "completions/mean_length": 3226.916748046875, "completions/mean_terminated_length": 3226.916748046875, "completions/min_length": 1360.0, "completions/min_terminated_length": 1360.0, "epoch": 0.2225237449118046, "frac_reward_zero_std": 0.5, "grad_norm": 0.16651302576065063, "kl": 0.0, "learning_rate": 3.9561766735679775e-07, "loss": -0.0028, "num_tokens": 22740867.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 656 }, { "completion_length": 2330.666748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6434.0, "completions/mean_length": 2879.75, "completions/mean_terminated_length": 2542.54541015625, "completions/min_length": 1151.0, "completions/min_terminated_length": 1151.0, "epoch": 0.22286295793758482, "frac_reward_zero_std": 0.0, "grad_norm": 0.2756352722644806, "kl": NaN, "learning_rate": 3.954451345755693e-07, "loss": -0.0489, "num_tokens": 22779851.0, "reward": 1.0750000476837158, "reward_std": 0.29088661074638367, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 657 }, { "completion_length": 678.5833435058594, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4787.0, "completions/mean_length": 3424.0, "completions/mean_terminated_length": 1163.2857666015625, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.223202170963365, "frac_reward_zero_std": 0.0, "grad_norm": 0.3502046465873718, "kl": NaN, "learning_rate": 3.952726017943409e-07, "loss": -0.0154, "num_tokens": 22801434.0, "reward": 0.6083333492279053, "reward_std": 0.1128770112991333, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.17499999701976776, "rewards/format_reward_func/std": 0.15447859466075897, "step": 658 }, { "completion_length": 1686.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5783.0, "completions/max_terminated_length": 5783.0, "completions/mean_length": 1686.75, "completions/mean_terminated_length": 1686.75, "completions/min_length": 722.0, "completions/min_terminated_length": 722.0, "epoch": 0.2235413839891452, "frac_reward_zero_std": 0.0, "grad_norm": 0.9250512719154358, "kl": 0.0, "learning_rate": 3.951000690131125e-07, "loss": 0.0408, "num_tokens": 22831161.0, "reward": 1.0833333730697632, "reward_std": 0.3129711151123047, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.37618499994277954, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 659 }, { "completion_length": 2479.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5375.0, "completions/max_terminated_length": 5375.0, "completions/mean_length": 2479.5, "completions/mean_terminated_length": 2479.5, "completions/min_length": 796.0, "completions/min_terminated_length": 796.0, "epoch": 0.22388059701492538, "frac_reward_zero_std": 0.5, "grad_norm": 0.5752743482589722, "kl": 0.0, "learning_rate": 3.94927536231884e-07, "loss": 0.0219, "num_tokens": 22872837.0, "reward": 1.149999976158142, "reward_std": 0.17606817185878754, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444525599479675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 660 }, { "completion_length": 3482.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5768.0, "completions/max_terminated_length": 5768.0, "completions/mean_length": 3482.916748046875, "completions/mean_terminated_length": 3482.916748046875, "completions/min_length": 842.0, "completions/min_terminated_length": 842.0, "epoch": 0.22421981004070557, "frac_reward_zero_std": 0.5, "grad_norm": 0.8260320425033569, "kl": 0.0, "learning_rate": 3.947550034506556e-07, "loss": -0.0385, "num_tokens": 22924898.0, "reward": 1.0166666507720947, "reward_std": 0.24013885855674744, "rewards/correctness_reward_func/mean": 0.7166666984558105, "rewards/correctness_reward_func/std": 0.4386618733406067, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 661 }, { "completion_length": 1033.9167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 1684.0, "completions/max_terminated_length": 1684.0, "completions/mean_length": 1033.916748046875, "completions/mean_terminated_length": 1033.916748046875, "completions/min_length": 538.0, "completions/min_terminated_length": 538.0, "epoch": 0.22455902306648576, "frac_reward_zero_std": 0.5, "grad_norm": 0.404193252325058, "kl": 0.0, "learning_rate": 3.945824706694272e-07, "loss": -0.0029, "num_tokens": 22953091.0, "reward": 1.058333396911621, "reward_std": 0.2610875070095062, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.37618499994277954, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 662 }, { "completion_length": 1029.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2507.0, "completions/max_terminated_length": 2507.0, "completions/mean_length": 1029.3333740234375, "completions/mean_terminated_length": 1029.3333740234375, "completions/min_length": 582.0, "completions/min_terminated_length": 582.0, "epoch": 0.22489823609226595, "frac_reward_zero_std": 1.0, "grad_norm": 1.0883605483513747e-07, "kl": 0.0, "learning_rate": 3.944099378881988e-07, "loss": 0.0, "num_tokens": 22976513.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 663 }, { "completion_length": 670.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 670.0, "completions/mean_terminated_length": 670.0, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.22523744911804613, "frac_reward_zero_std": 0.5, "grad_norm": 0.06571773439645767, "kl": 0.0, "learning_rate": 3.942374051069703e-07, "loss": -0.0005, "num_tokens": 22999475.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 664 }, { "completion_length": 2176.8333740234375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6299.0, "completions/mean_length": 3275.0, "completions/mean_terminated_length": 2612.199951171875, "completions/min_length": 997.0, "completions/min_terminated_length": 997.0, "epoch": 0.22557666214382632, "frac_reward_zero_std": 0.5, "grad_norm": 0.5015944838523865, "kl": NaN, "learning_rate": 3.940648723257419e-07, "loss": 0.0043, "num_tokens": 23037429.0, "reward": 0.8166667819023132, "reward_std": 0.30441200733184814, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.42497774958610535, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 665 }, { "completion_length": 2563.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5240.0, "completions/max_terminated_length": 5240.0, "completions/mean_length": 2563.0, "completions/mean_terminated_length": 2563.0, "completions/min_length": 923.0, "completions/min_terminated_length": 923.0, "epoch": 0.2259158751696065, "frac_reward_zero_std": 0.5, "grad_norm": 0.5822378396987915, "kl": 0.0, "learning_rate": 3.9389233954451344e-07, "loss": 0.026, "num_tokens": 23078463.0, "reward": 1.066666603088379, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941801071167, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 666 }, { "completion_length": 2461.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4158.0, "completions/max_terminated_length": 4158.0, "completions/mean_length": 2461.33349609375, "completions/mean_terminated_length": 2461.33349609375, "completions/min_length": 1184.0, "completions/min_terminated_length": 1184.0, "epoch": 0.2262550881953867, "frac_reward_zero_std": 1.0, "grad_norm": 1.4431849137963582e-07, "kl": 0.0, "learning_rate": 3.9371980676328504e-07, "loss": 0.0, "num_tokens": 23119081.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 667 }, { "completion_length": 1863.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4872.0, "completions/max_terminated_length": 4872.0, "completions/mean_length": 1863.3333740234375, "completions/mean_terminated_length": 1863.3333740234375, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 0.2265943012211669, "frac_reward_zero_std": 0.0, "grad_norm": 0.14633503556251526, "kl": 0.0, "learning_rate": 3.9354727398205654e-07, "loss": 0.0029, "num_tokens": 23151605.0, "reward": 1.2166666984558105, "reward_std": 0.09246456623077393, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 668 }, { "completion_length": 1864.8333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4246.0, "completions/mean_length": 2413.916748046875, "completions/mean_terminated_length": 2034.3636474609375, "completions/min_length": 911.0, "completions/min_terminated_length": 911.0, "epoch": 0.22693351424694708, "frac_reward_zero_std": 0.0, "grad_norm": 0.5761399269104004, "kl": NaN, "learning_rate": 3.9337474120082815e-07, "loss": -0.0322, "num_tokens": 23187561.0, "reward": 0.8250000476837158, "reward_std": 0.27409863471984863, "rewards/correctness_reward_func/mean": 0.550000011920929, "rewards/correctness_reward_func/std": 0.4100997745990753, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 669 }, { "completion_length": 1130.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1708.0, "completions/max_terminated_length": 1708.0, "completions/mean_length": 1130.916748046875, "completions/mean_terminated_length": 1130.916748046875, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "epoch": 0.22727272727272727, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.932022084195997e-07, "loss": 0.0, "num_tokens": 23216174.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 670 }, { "completion_length": 1128.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2095.0, "completions/max_terminated_length": 2095.0, "completions/mean_length": 1128.416748046875, "completions/mean_terminated_length": 1128.416748046875, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "epoch": 0.22761194029850745, "frac_reward_zero_std": 0.5, "grad_norm": 0.09337574988603592, "kl": 0.0, "learning_rate": 3.9302967563837126e-07, "loss": 0.0024, "num_tokens": 23244121.0, "reward": 0.75, "reward_std": 0.05477222427725792, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 671 }, { "completion_length": 1535.9166870117188, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 5545.0, "completions/mean_length": 4830.4169921875, "completions/mean_terminated_length": 3071.83349609375, "completions/min_length": 1840.0, "completions/min_terminated_length": 1840.0, "epoch": 0.22795115332428764, "frac_reward_zero_std": 0.5, "grad_norm": 0.6959670782089233, "kl": NaN, "learning_rate": 3.928571428571428e-07, "loss": -0.0068, "num_tokens": 23274990.0, "reward": 0.28333336114883423, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.13333334028720856, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 672 }, { "completion_length": 1835.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3286.0, "completions/max_terminated_length": 3286.0, "completions/mean_length": 1835.8333740234375, "completions/mean_terminated_length": 1835.8333740234375, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "epoch": 0.22829036635006783, "frac_reward_zero_std": 0.5, "grad_norm": 0.10714925825595856, "kl": 0.0, "learning_rate": 3.926846100759144e-07, "loss": 0.0022, "num_tokens": 23310532.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 673 }, { "completion_length": 1179.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2333.0, "completions/max_terminated_length": 2333.0, "completions/mean_length": 1179.166748046875, "completions/mean_terminated_length": 1179.166748046875, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 0.22862957937584802, "frac_reward_zero_std": 1.0, "grad_norm": 2.31352046853317e-07, "kl": 0.0, "learning_rate": 3.92512077294686e-07, "loss": 0.0, "num_tokens": 23339454.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 674 }, { "completion_length": 2697.08349609375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5319.0, "completions/mean_length": 3246.166748046875, "completions/mean_terminated_length": 2942.272705078125, "completions/min_length": 974.0, "completions/min_terminated_length": 974.0, "epoch": 0.22896879240162823, "frac_reward_zero_std": 0.5, "grad_norm": 0.6637680530548096, "kl": NaN, "learning_rate": 3.923395445134575e-07, "loss": -0.0302, "num_tokens": 23380981.0, "reward": 0.8583332896232605, "reward_std": 0.22453653812408447, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.5149286389350891, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 675 }, { "completion_length": 1928.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3160.0, "completions/max_terminated_length": 3160.0, "completions/mean_length": 1928.8333740234375, "completions/mean_terminated_length": 1928.8333740234375, "completions/min_length": 953.0, "completions/min_terminated_length": 953.0, "epoch": 0.22930800542740842, "frac_reward_zero_std": 0.5, "grad_norm": 0.10171937942504883, "kl": 0.0, "learning_rate": 3.921670117322291e-07, "loss": -0.0015, "num_tokens": 23414105.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 676 }, { "completion_length": 547.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 844.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 547.8333740234375, "completions/mean_terminated_length": 547.8333740234375, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.2296472184531886, "frac_reward_zero_std": 0.5, "grad_norm": 0.059833355247974396, "kl": 0.0, "learning_rate": 3.919944789510007e-07, "loss": 0.0003, "num_tokens": 23431623.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 677 }, { "completion_length": 1936.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4324.0, "completions/max_terminated_length": 4324.0, "completions/mean_length": 1936.75, "completions/mean_terminated_length": 1936.75, "completions/min_length": 579.0, "completions/min_terminated_length": 579.0, "epoch": 0.2299864314789688, "frac_reward_zero_std": 0.5, "grad_norm": 0.4380330443382263, "kl": 0.0, "learning_rate": 3.918219461697723e-07, "loss": -0.0158, "num_tokens": 23468760.0, "reward": 1.133333444595337, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.28069180250167847, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 678 }, { "completion_length": 910.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1362.0, "completions/max_terminated_length": 1362.0, "completions/mean_length": 910.4166870117188, "completions/mean_terminated_length": 910.4166870117188, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 0.230325644504749, "frac_reward_zero_std": 1.0, "grad_norm": 1.0412031770101748e-07, "kl": 0.0, "learning_rate": 3.916494133885438e-07, "loss": 0.0, "num_tokens": 23489765.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 679 }, { "completion_length": 1419.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2880.0, "completions/max_terminated_length": 2880.0, "completions/mean_length": 1419.8333740234375, "completions/mean_terminated_length": 1419.8333740234375, "completions/min_length": 559.0, "completions/min_terminated_length": 559.0, "epoch": 0.23066485753052918, "frac_reward_zero_std": 0.5, "grad_norm": 0.11209948360919952, "kl": 0.0, "learning_rate": 3.914768806073154e-07, "loss": -0.0005, "num_tokens": 23520645.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 680 }, { "completion_length": 2968.7501220703125, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6533.0, "completions/mean_length": 4066.916748046875, "completions/mean_terminated_length": 3562.5, "completions/min_length": 2128.0, "completions/min_terminated_length": 2128.0, "epoch": 0.23100407055630937, "frac_reward_zero_std": 0.5, "grad_norm": 0.10447150468826294, "kl": NaN, "learning_rate": 3.9130434782608694e-07, "loss": -0.0173, "num_tokens": 23564442.0, "reward": 0.6625000834465027, "reward_std": 0.06274950504302979, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 681 }, { "completion_length": 731.0833740234375, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 2927.416748046875, "completions/mean_terminated_length": 1096.625, "completions/min_length": 620.0, "completions/min_terminated_length": 620.0, "epoch": 0.23134328358208955, "frac_reward_zero_std": 0.5, "grad_norm": 0.145811066031456, "kl": NaN, "learning_rate": 3.911318150448585e-07, "loss": -0.0086, "num_tokens": 23589685.0, "reward": 0.699999988079071, "reward_std": 0.0774596780538559, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 682 }, { "completion_length": 841.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1748.0, "completions/max_terminated_length": 1748.0, "completions/mean_length": 841.0, "completions/mean_terminated_length": 841.0, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 0.23168249660786974, "frac_reward_zero_std": 0.5, "grad_norm": 0.2751244008541107, "kl": 0.0, "learning_rate": 3.9095928226363005e-07, "loss": -0.0016, "num_tokens": 23610985.0, "reward": 0.9666668176651001, "reward_std": 0.20655910670757294, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 683 }, { "completion_length": 2837.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5740.0, "completions/max_terminated_length": 5740.0, "completions/mean_length": 2837.666748046875, "completions/mean_terminated_length": 2837.666748046875, "completions/min_length": 1397.0, "completions/min_terminated_length": 1397.0, "epoch": 0.23202170963364993, "frac_reward_zero_std": 0.5, "grad_norm": 0.0925978422164917, "kl": 0.0, "learning_rate": 3.9078674948240165e-07, "loss": -0.0017, "num_tokens": 23660457.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 684 }, { "completion_length": 1592.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4128.0, "completions/max_terminated_length": 4128.0, "completions/mean_length": 1592.0, "completions/mean_terminated_length": 1592.0, "completions/min_length": 664.0, "completions/min_terminated_length": 664.0, "epoch": 0.23236092265943012, "frac_reward_zero_std": 0.5, "grad_norm": 0.6901833415031433, "kl": 0.0, "learning_rate": 3.906142167011732e-07, "loss": -0.0094, "num_tokens": 23688075.0, "reward": 0.9666666388511658, "reward_std": 0.2581988573074341, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.4923659861087799, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 685 }, { "completion_length": 566.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 566.25, "completions/mean_terminated_length": 566.25, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.2327001356852103, "frac_reward_zero_std": 0.0, "grad_norm": 0.23003122210502625, "kl": 0.0, "learning_rate": 3.9044168391994476e-07, "loss": 0.0012, "num_tokens": 23707470.0, "reward": 1.1166666746139526, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 686 }, { "completion_length": 1287.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2399.0, "completions/max_terminated_length": 2399.0, "completions/mean_length": 1287.666748046875, "completions/mean_terminated_length": 1287.666748046875, "completions/min_length": 639.0, "completions/min_terminated_length": 639.0, "epoch": 0.2330393487109905, "frac_reward_zero_std": 0.5, "grad_norm": 0.09936317056417465, "kl": 0.0, "learning_rate": 3.902691511387163e-07, "loss": 0.0, "num_tokens": 23736716.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 687 }, { "completion_length": 1156.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 4769.0, "completions/max_terminated_length": 4769.0, "completions/mean_length": 1156.25, "completions/mean_terminated_length": 1156.25, "completions/min_length": 576.0, "completions/min_terminated_length": 576.0, "epoch": 0.23337856173677068, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.900966183574879e-07, "loss": 0.0, "num_tokens": 23758805.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 688 }, { "completion_length": 1892.5000610351562, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6275.0, "completions/mean_length": 2441.58349609375, "completions/mean_terminated_length": 2064.54541015625, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 0.23371777476255087, "frac_reward_zero_std": 0.0, "grad_norm": 0.6890511512756348, "kl": NaN, "learning_rate": 3.8992408557625947e-07, "loss": -0.0588, "num_tokens": 23796545.0, "reward": 1.0375001430511475, "reward_std": 0.2679903209209442, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.36306774616241455, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 689 }, { "completion_length": 2770.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4142.0, "completions/max_terminated_length": 4142.0, "completions/mean_length": 2770.33349609375, "completions/mean_terminated_length": 2770.33349609375, "completions/min_length": 1178.0, "completions/min_terminated_length": 1178.0, "epoch": 0.23405698778833106, "frac_reward_zero_std": 0.0, "grad_norm": 0.734200656414032, "kl": 0.0, "learning_rate": 3.89751552795031e-07, "loss": -0.0003, "num_tokens": 23845119.0, "reward": 1.0166666507720947, "reward_std": 0.440767377614975, "rewards/correctness_reward_func/mean": 0.7166666984558105, "rewards/correctness_reward_func/std": 0.4386618733406067, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 690 }, { "completion_length": 1363.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1962.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 1363.5833740234375, "completions/mean_terminated_length": 1363.5833740234375, "completions/min_length": 693.0, "completions/min_terminated_length": 693.0, "epoch": 0.23439620081411125, "frac_reward_zero_std": 1.0, "grad_norm": 2.2079890982240613e-07, "kl": 0.0, "learning_rate": 3.8957902001380263e-07, "loss": 0.0, "num_tokens": 23873902.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 691 }, { "completion_length": 822.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1576.0, "completions/max_terminated_length": 1576.0, "completions/mean_length": 822.8333740234375, "completions/mean_terminated_length": 822.8333740234375, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.23473541383989144, "frac_reward_zero_std": 0.0, "grad_norm": 0.10281941294670105, "kl": 0.0, "learning_rate": 3.894064872325742e-07, "loss": -0.0002, "num_tokens": 23895980.0, "reward": 1.1500000953674316, "reward_std": 0.09246459603309631, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 692 }, { "completion_length": 699.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1049.0, "completions/max_terminated_length": 1049.0, "completions/mean_length": 699.5833740234375, "completions/mean_terminated_length": 699.5833740234375, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.23507462686567165, "frac_reward_zero_std": 0.5, "grad_norm": 0.28774434328079224, "kl": 0.0, "learning_rate": 3.8923395445134574e-07, "loss": -0.0007, "num_tokens": 23916567.0, "reward": 0.8833333849906921, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.5149286389350891, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 693 }, { "completion_length": 1013.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2550.0, "completions/max_terminated_length": 2550.0, "completions/mean_length": 1013.0, "completions/mean_terminated_length": 1013.0, "completions/min_length": 566.0, "completions/min_terminated_length": 566.0, "epoch": 0.23541383989145184, "frac_reward_zero_std": 0.5, "grad_norm": 0.07994963973760605, "kl": 0.0, "learning_rate": 3.890614216701173e-07, "loss": 0.0024, "num_tokens": 23944047.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 694 }, { "completion_length": 1904.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3562.0, "completions/max_terminated_length": 3562.0, "completions/mean_length": 1904.916748046875, "completions/mean_terminated_length": 1904.916748046875, "completions/min_length": 606.0, "completions/min_terminated_length": 606.0, "epoch": 0.23575305291723203, "frac_reward_zero_std": 0.5, "grad_norm": 0.09583219140768051, "kl": 0.0, "learning_rate": 3.888888888888889e-07, "loss": -0.0006, "num_tokens": 23977784.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 695 }, { "completion_length": 876.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1769.0, "completions/max_terminated_length": 1769.0, "completions/mean_length": 876.8333740234375, "completions/mean_terminated_length": 876.8333740234375, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.23609226594301222, "frac_reward_zero_std": 1.0, "grad_norm": 7.380832300896145e-08, "kl": 0.0, "learning_rate": 3.8871635610766045e-07, "loss": 0.0, "num_tokens": 24001866.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 696 }, { "completion_length": 1457.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3931.0, "completions/max_terminated_length": 3931.0, "completions/mean_length": 1457.5, "completions/mean_terminated_length": 1457.5, "completions/min_length": 695.0, "completions/min_terminated_length": 695.0, "epoch": 0.2364314789687924, "frac_reward_zero_std": 0.5, "grad_norm": 0.5751703381538391, "kl": 0.0, "learning_rate": 3.88543823326432e-07, "loss": -0.022, "num_tokens": 24030342.0, "reward": 0.9000000953674316, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.6000000238418579, "rewards/correctness_reward_func/std": 0.36181363463401794, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 697 }, { "completion_length": 1276.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 1866.0, "completions/max_terminated_length": 1866.0, "completions/mean_length": 1276.666748046875, "completions/mean_terminated_length": 1276.666748046875, "completions/min_length": 876.0, "completions/min_terminated_length": 876.0, "epoch": 0.2367706919945726, "frac_reward_zero_std": 0.0, "grad_norm": 0.5665131211280823, "kl": 0.0, "learning_rate": 3.8837129054520355e-07, "loss": -0.0076, "num_tokens": 24057146.0, "reward": 1.066666841506958, "reward_std": 0.3098386526107788, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941801071167, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 698 }, { "completion_length": 1514.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3280.0, "completions/max_terminated_length": 3280.0, "completions/mean_length": 1514.3333740234375, "completions/mean_terminated_length": 1514.3333740234375, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.23710990502035278, "frac_reward_zero_std": 0.0, "grad_norm": 0.4253982901573181, "kl": 0.0, "learning_rate": 3.8819875776397516e-07, "loss": 0.0088, "num_tokens": 24085560.0, "reward": 0.7000000476837158, "reward_std": 0.3265986442565918, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 699 }, { "completion_length": 1934.5833740234375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5187.0, "completions/mean_length": 3032.75, "completions/mean_terminated_length": 2321.5, "completions/min_length": 951.0, "completions/min_terminated_length": 951.0, "epoch": 0.23744911804613297, "frac_reward_zero_std": 0.5, "grad_norm": 0.4350591003894806, "kl": NaN, "learning_rate": 3.880262249827467e-07, "loss": -0.0673, "num_tokens": 24118171.0, "reward": 0.9166667461395264, "reward_std": 0.2840188145637512, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 700 }, { "completion_length": 1958.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4626.0, "completions/max_terminated_length": 4626.0, "completions/mean_length": 1958.0, "completions/mean_terminated_length": 1958.0, "completions/min_length": 797.0, "completions/min_terminated_length": 797.0, "epoch": 0.23778833107191316, "frac_reward_zero_std": 0.0, "grad_norm": 0.35070163011550903, "kl": 0.0, "learning_rate": 3.8785369220151826e-07, "loss": 0.0025, "num_tokens": 24153667.0, "reward": 1.1000001430511475, "reward_std": 0.23490385711193085, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 701 }, { "completion_length": 1124.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2765.0, "completions/max_terminated_length": 2765.0, "completions/mean_length": 1124.5, "completions/mean_terminated_length": 1124.5, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.23812754409769335, "frac_reward_zero_std": 0.5, "grad_norm": 0.11582597345113754, "kl": 0.0, "learning_rate": 3.876811594202898e-07, "loss": -0.0012, "num_tokens": 24179749.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 702 }, { "completion_length": 1943.25, "completions/clipped_ratio": 0.0, "completions/max_length": 4035.0, "completions/max_terminated_length": 4035.0, "completions/mean_length": 1943.25, "completions/mean_terminated_length": 1943.25, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 0.23846675712347354, "frac_reward_zero_std": 0.5, "grad_norm": 0.11352552473545074, "kl": 0.0, "learning_rate": 3.875086266390614e-07, "loss": 0.0012, "num_tokens": 24213964.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 703 }, { "completion_length": 1198.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2858.0, "completions/max_terminated_length": 2858.0, "completions/mean_length": 1198.25, "completions/mean_terminated_length": 1198.25, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.23880597014925373, "frac_reward_zero_std": 0.5, "grad_norm": 0.07122494280338287, "kl": 0.0, "learning_rate": 3.873360938578329e-07, "loss": 0.001, "num_tokens": 24244057.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 704 }, { "completion_length": 2193.666748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3976.0, "completions/mean_length": 2742.75, "completions/mean_terminated_length": 2393.091064453125, "completions/min_length": 843.0, "completions/min_terminated_length": 843.0, "epoch": 0.23914518317503392, "frac_reward_zero_std": 0.5, "grad_norm": 0.09500417858362198, "kl": NaN, "learning_rate": 3.8716356107660453e-07, "loss": -0.0105, "num_tokens": 24282195.0, "reward": 0.6750000715255737, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 705 }, { "completion_length": 2630.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5251.0, "completions/max_terminated_length": 5251.0, "completions/mean_length": 2630.58349609375, "completions/mean_terminated_length": 2630.58349609375, "completions/min_length": 829.0, "completions/min_terminated_length": 829.0, "epoch": 0.2394843962008141, "frac_reward_zero_std": 0.0, "grad_norm": 0.9753767848014832, "kl": 0.0, "learning_rate": 3.8699102829537613e-07, "loss": -0.0168, "num_tokens": 24327838.0, "reward": 1.1000001430511475, "reward_std": 0.3098386526107788, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.38138505816459656, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 706 }, { "completion_length": 1942.7500610351562, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4530.0, "completions/mean_length": 2491.83349609375, "completions/mean_terminated_length": 2119.36376953125, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 0.2398236092265943, "frac_reward_zero_std": 0.0, "grad_norm": 0.2561108469963074, "kl": NaN, "learning_rate": 3.868184955141477e-07, "loss": -0.0457, "num_tokens": 24366193.0, "reward": 1.0416667461395264, "reward_std": 0.2761763334274292, "rewards/correctness_reward_func/mean": 0.7666667103767395, "rewards/correctness_reward_func/std": 0.2534608840942383, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 707 }, { "completion_length": 1299.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2080.0, "completions/max_terminated_length": 2080.0, "completions/mean_length": 1299.0, "completions/mean_terminated_length": 1299.0, "completions/min_length": 559.0, "completions/min_terminated_length": 559.0, "epoch": 0.24016282225237448, "frac_reward_zero_std": 0.5, "grad_norm": 0.11090793460607529, "kl": 0.0, "learning_rate": 3.8664596273291924e-07, "loss": -0.0001, "num_tokens": 24393121.0, "reward": 0.7666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 708 }, { "completion_length": 2367.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5939.0, "completions/max_terminated_length": 5939.0, "completions/mean_length": 2367.75, "completions/mean_terminated_length": 2367.75, "completions/min_length": 1082.0, "completions/min_terminated_length": 1082.0, "epoch": 0.24050203527815467, "frac_reward_zero_std": 1.0, "grad_norm": 3.2568166830060363e-07, "kl": 0.0, "learning_rate": 3.864734299516908e-07, "loss": 0.0, "num_tokens": 24432862.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 709 }, { "completion_length": 1470.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2653.0, "completions/max_terminated_length": 2653.0, "completions/mean_length": 1470.666748046875, "completions/mean_terminated_length": 1470.666748046875, "completions/min_length": 611.0, "completions/min_terminated_length": 611.0, "epoch": 0.24084124830393486, "frac_reward_zero_std": 1.0, "grad_norm": 2.8426899234546e-07, "kl": 0.0, "learning_rate": 3.863008971704624e-07, "loss": 0.0, "num_tokens": 24462162.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 710 }, { "completion_length": 2114.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6433.0, "completions/max_terminated_length": 6433.0, "completions/mean_length": 2114.25, "completions/mean_terminated_length": 2114.25, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 0.24118046132971507, "frac_reward_zero_std": 0.5, "grad_norm": 0.05685143172740936, "kl": 0.0, "learning_rate": 3.8612836438923395e-07, "loss": 0.0, "num_tokens": 24495963.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 711 }, { "completion_length": 929.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1821.0, "completions/max_terminated_length": 1821.0, "completions/mean_length": 929.1666870117188, "completions/mean_terminated_length": 929.1666870117188, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 0.24151967435549526, "frac_reward_zero_std": 0.5, "grad_norm": 0.06048990786075592, "kl": 0.0, "learning_rate": 3.859558316080055e-07, "loss": -0.0, "num_tokens": 24517901.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 712 }, { "completion_length": 1756.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3761.0, "completions/max_terminated_length": 3761.0, "completions/mean_length": 1756.3333740234375, "completions/mean_terminated_length": 1756.3333740234375, "completions/min_length": 840.0, "completions/min_terminated_length": 840.0, "epoch": 0.24185888738127545, "frac_reward_zero_std": 0.5, "grad_norm": 0.14410604536533356, "kl": 0.0, "learning_rate": 3.8578329882677706e-07, "loss": 0.0002, "num_tokens": 24553629.0, "reward": 1.1541666984558105, "reward_std": 0.07486096024513245, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 713 }, { "completion_length": 2858.5001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5566.0, "completions/mean_length": 3407.58349609375, "completions/mean_terminated_length": 3118.36376953125, "completions/min_length": 1465.0, "completions/min_terminated_length": 1465.0, "epoch": 0.24219810040705564, "frac_reward_zero_std": 0.0, "grad_norm": 0.2787613272666931, "kl": NaN, "learning_rate": 3.8561076604554866e-07, "loss": -0.0455, "num_tokens": 24603819.0, "reward": 1.0916666984558105, "reward_std": 0.26536136865615845, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 714 }, { "completion_length": 2805.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6465.0, "completions/max_terminated_length": 6465.0, "completions/mean_length": 2805.5, "completions/mean_terminated_length": 2805.5, "completions/min_length": 967.0, "completions/min_terminated_length": 967.0, "epoch": 0.24253731343283583, "frac_reward_zero_std": 0.5, "grad_norm": 0.48999252915382385, "kl": 0.0, "learning_rate": 3.8543823326432016e-07, "loss": 0.0095, "num_tokens": 24648591.0, "reward": 0.38333332538604736, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.0833333358168602, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 715 }, { "completion_length": 2720.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 5956.0, "completions/max_terminated_length": 5956.0, "completions/mean_length": 2720.166748046875, "completions/mean_terminated_length": 2720.166748046875, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "epoch": 0.24287652645861602, "frac_reward_zero_std": 0.5, "grad_norm": 0.42055872082710266, "kl": 0.0, "learning_rate": 3.8526570048309177e-07, "loss": -0.0159, "num_tokens": 24692747.0, "reward": 0.6333333253860474, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 716 }, { "completion_length": 1547.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2570.0, "completions/max_terminated_length": 2570.0, "completions/mean_length": 1547.416748046875, "completions/mean_terminated_length": 1547.416748046875, "completions/min_length": 846.0, "completions/min_terminated_length": 846.0, "epoch": 0.2432157394843962, "frac_reward_zero_std": 0.5, "grad_norm": 0.07790729403495789, "kl": 0.0, "learning_rate": 3.850931677018633e-07, "loss": -0.0014, "num_tokens": 24721240.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 717 }, { "completion_length": 1684.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4692.0, "completions/max_terminated_length": 4692.0, "completions/mean_length": 1684.5, "completions/mean_terminated_length": 1684.5, "completions/min_length": 559.0, "completions/min_terminated_length": 559.0, "epoch": 0.2435549525101764, "frac_reward_zero_std": 0.5, "grad_norm": 0.09478975832462311, "kl": 0.0, "learning_rate": 3.8492063492063493e-07, "loss": -0.0023, "num_tokens": 24754684.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 718 }, { "completion_length": 3755.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6407.0, "completions/max_terminated_length": 6407.0, "completions/mean_length": 3755.0, "completions/mean_terminated_length": 3755.0, "completions/min_length": 2256.0, "completions/min_terminated_length": 2256.0, "epoch": 0.24389416553595658, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.8474810213940643e-07, "loss": 0.0, "num_tokens": 24810616.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 719 }, { "completion_length": 1573.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2848.0, "completions/max_terminated_length": 2848.0, "completions/mean_length": 1573.3333740234375, "completions/mean_terminated_length": 1573.3333740234375, "completions/min_length": 851.0, "completions/min_terminated_length": 851.0, "epoch": 0.24423337856173677, "frac_reward_zero_std": 0.5, "grad_norm": 0.08259549736976624, "kl": 0.0, "learning_rate": 3.8457556935817803e-07, "loss": -0.0009, "num_tokens": 24840470.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 720 }, { "completion_length": 1527.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4029.0, "completions/max_terminated_length": 4029.0, "completions/mean_length": 1527.416748046875, "completions/mean_terminated_length": 1527.416748046875, "completions/min_length": 735.0, "completions/min_terminated_length": 735.0, "epoch": 0.24457259158751696, "frac_reward_zero_std": 0.5, "grad_norm": 0.507327139377594, "kl": 0.0, "learning_rate": 3.8440303657694964e-07, "loss": -0.022, "num_tokens": 24872401.0, "reward": 1.0, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.43064433336257935, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 721 }, { "completion_length": 2963.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4433.0, "completions/max_terminated_length": 4433.0, "completions/mean_length": 2963.916748046875, "completions/mean_terminated_length": 2963.916748046875, "completions/min_length": 1830.0, "completions/min_terminated_length": 1830.0, "epoch": 0.24491180461329715, "frac_reward_zero_std": 0.0, "grad_norm": 0.5084398984909058, "kl": 0.0, "learning_rate": 3.842305037957212e-07, "loss": 0.0097, "num_tokens": 24920676.0, "reward": 1.149999976158142, "reward_std": 0.2270146608352661, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444522619247437, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 722 }, { "completion_length": 2600.0833740234375, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5634.0, "completions/mean_length": 4247.33349609375, "completions/mean_terminated_length": 3466.77783203125, "completions/min_length": 919.0, "completions/min_terminated_length": 919.0, "epoch": 0.24525101763907733, "frac_reward_zero_std": 0.0, "grad_norm": 1.0371900796890259, "kl": NaN, "learning_rate": 3.8405797101449274e-07, "loss": -0.0943, "num_tokens": 24960865.0, "reward": 0.7083333730697632, "reward_std": 0.4670211672782898, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 723 }, { "completion_length": 526.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1250.0, "completions/max_terminated_length": 1250.0, "completions/mean_length": 526.3333740234375, "completions/mean_terminated_length": 526.3333740234375, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.24559023066485752, "frac_reward_zero_std": 1.0, "grad_norm": 7.004301494362153e-08, "kl": 0.0, "learning_rate": 3.838854382332643e-07, "loss": 0.0, "num_tokens": 24981371.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 724 }, { "completion_length": 1995.166748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4928.0, "completions/mean_length": 2544.25, "completions/mean_terminated_length": 2176.54541015625, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.2459294436906377, "frac_reward_zero_std": 0.5, "grad_norm": 0.07325784862041473, "kl": NaN, "learning_rate": 3.837129054520359e-07, "loss": -0.0124, "num_tokens": 25017247.0, "reward": 0.6750000715255737, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 725 }, { "completion_length": 1874.3334350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 4024.0, "completions/max_terminated_length": 4024.0, "completions/mean_length": 1874.3333740234375, "completions/mean_terminated_length": 1874.3333740234375, "completions/min_length": 585.0, "completions/min_terminated_length": 585.0, "epoch": 0.2462686567164179, "frac_reward_zero_std": 1.0, "grad_norm": 1.9815544760604098e-07, "kl": 0.0, "learning_rate": 3.835403726708074e-07, "loss": 0.0, "num_tokens": 25055375.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 726 }, { "completion_length": 1643.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5250.0, "completions/max_terminated_length": 5250.0, "completions/mean_length": 1643.5833740234375, "completions/mean_terminated_length": 1643.5833740234375, "completions/min_length": 555.0, "completions/min_terminated_length": 555.0, "epoch": 0.2466078697421981, "frac_reward_zero_std": 1.0, "grad_norm": 2.3413605276800808e-07, "kl": 0.0, "learning_rate": 3.83367839889579e-07, "loss": 0.0, "num_tokens": 25086828.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 727 }, { "completion_length": 2202.2500610351562, "completions/clipped_ratio": 0.5833333333333333, "completions/max_length": 6589.0, "completions/max_terminated_length": 6381.0, "completions/mean_length": 6045.83349609375, "completions/mean_terminated_length": 5285.39990234375, "completions/min_length": 3332.0, "completions/min_terminated_length": 3332.0, "epoch": 0.24694708276797828, "frac_reward_zero_std": 0.0, "grad_norm": 0.7284405827522278, "kl": NaN, "learning_rate": 3.8319530710835056e-07, "loss": -0.1411, "num_tokens": 25121163.0, "reward": 0.33750003576278687, "reward_std": 0.37498682737350464, "rewards/correctness_reward_func/mean": 0.20000000298023224, "rewards/correctness_reward_func/std": 0.36181363463401794, "rewards/format_reward_func/mean": 0.13750000298023224, "rewards/format_reward_func/std": 0.14943073689937592, "step": 728 }, { "completion_length": 1111.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1887.0, "completions/max_terminated_length": 1887.0, "completions/mean_length": 1111.3333740234375, "completions/mean_terminated_length": 1111.3333740234375, "completions/min_length": 764.0, "completions/min_terminated_length": 764.0, "epoch": 0.2472862957937585, "frac_reward_zero_std": 1.0, "grad_norm": 1.3690761591078626e-07, "kl": 0.0, "learning_rate": 3.8302277432712217e-07, "loss": 0.0, "num_tokens": 25146583.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 729 }, { "completion_length": 3647.250244140625, "completions/clipped_ratio": 0.0, "completions/max_length": 5731.0, "completions/max_terminated_length": 5731.0, "completions/mean_length": 3647.25, "completions/mean_terminated_length": 3647.25, "completions/min_length": 1535.0, "completions/min_terminated_length": 1535.0, "epoch": 0.24762550881953868, "frac_reward_zero_std": 0.0, "grad_norm": 0.6610546708106995, "kl": 0.0, "learning_rate": 3.8285024154589367e-07, "loss": -0.0118, "num_tokens": 25205158.0, "reward": 0.7833334803581238, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.43029239773750305, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 730 }, { "completion_length": 2499.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6543.0, "completions/max_terminated_length": 6543.0, "completions/mean_length": 2499.666748046875, "completions/mean_terminated_length": 2499.666748046875, "completions/min_length": 608.0, "completions/min_terminated_length": 608.0, "epoch": 0.24796472184531887, "frac_reward_zero_std": 0.5, "grad_norm": 0.07455330342054367, "kl": 0.0, "learning_rate": 3.8267770876466527e-07, "loss": -0.0011, "num_tokens": 25245858.0, "reward": 0.7666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 731 }, { "completion_length": 1155.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2734.0, "completions/max_terminated_length": 2734.0, "completions/mean_length": 1155.25, "completions/mean_terminated_length": 1155.25, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.24830393487109906, "frac_reward_zero_std": 0.5, "grad_norm": 0.08911774307489395, "kl": 0.0, "learning_rate": 3.825051759834368e-07, "loss": 0.0029, "num_tokens": 25270719.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 732 }, { "completion_length": 1258.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1898.0, "completions/max_terminated_length": 1898.0, "completions/mean_length": 1258.8333740234375, "completions/mean_terminated_length": 1258.8333740234375, "completions/min_length": 658.0, "completions/min_terminated_length": 658.0, "epoch": 0.24864314789687925, "frac_reward_zero_std": 0.5, "grad_norm": 0.05099448561668396, "kl": 0.0, "learning_rate": 3.8233264320220843e-07, "loss": -0.0012, "num_tokens": 25292191.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 733 }, { "completion_length": 2122.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 2994.0, "completions/max_terminated_length": 2994.0, "completions/mean_length": 2122.75, "completions/mean_terminated_length": 2122.75, "completions/min_length": 971.0, "completions/min_terminated_length": 971.0, "epoch": 0.24898236092265943, "frac_reward_zero_std": 0.5, "grad_norm": 0.335657000541687, "kl": 0.0, "learning_rate": 3.8216011042097993e-07, "loss": 0.0078, "num_tokens": 25329844.0, "reward": 0.6333333253860474, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 734 }, { "completion_length": 2080.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5717.0, "completions/max_terminated_length": 5717.0, "completions/mean_length": 2080.75, "completions/mean_terminated_length": 2080.75, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 0.24932157394843962, "frac_reward_zero_std": 1.0, "grad_norm": 3.0757533409087046e-07, "kl": 0.0, "learning_rate": 3.8198757763975154e-07, "loss": 0.0, "num_tokens": 25368103.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 735 }, { "completion_length": 2086.5001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5375.0, "completions/mean_length": 2635.58349609375, "completions/mean_terminated_length": 2276.181884765625, "completions/min_length": 965.0, "completions/min_terminated_length": 965.0, "epoch": 0.2496607869742198, "frac_reward_zero_std": 0.0, "grad_norm": 0.5441375374794006, "kl": NaN, "learning_rate": 3.818150448585231e-07, "loss": 0.0135, "num_tokens": 25404553.0, "reward": 0.5416666865348816, "reward_std": 0.2677963674068451, "rewards/correctness_reward_func/mean": 0.2666666805744171, "rewards/correctness_reward_func/std": 0.393892765045166, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 736 }, { "completion_length": 1710.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3985.0, "completions/max_terminated_length": 3985.0, "completions/mean_length": 1710.916748046875, "completions/mean_terminated_length": 1710.916748046875, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 0.25, "frac_reward_zero_std": 0.5, "grad_norm": 0.43407583236694336, "kl": 0.0, "learning_rate": 3.8164251207729464e-07, "loss": 0.0034, "num_tokens": 25435602.0, "reward": 0.6333333253860474, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 737 }, { "completion_length": 1790.5001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4435.0, "completions/mean_length": 2339.58349609375, "completions/mean_terminated_length": 1953.2728271484375, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "epoch": 0.2503392130257802, "frac_reward_zero_std": 0.0, "grad_norm": 0.07025188952684402, "kl": NaN, "learning_rate": 3.8146997929606625e-07, "loss": -0.0104, "num_tokens": 25472916.0, "reward": 0.26250001788139343, "reward_std": 0.09185586869716644, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 738 }, { "completion_length": 1810.4166870117188, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5111.0, "completions/mean_length": 2908.58349609375, "completions/mean_terminated_length": 2172.5, "completions/min_length": 804.0, "completions/min_terminated_length": 804.0, "epoch": 0.2506784260515604, "frac_reward_zero_std": 0.5, "grad_norm": 0.8079109191894531, "kl": NaN, "learning_rate": 3.812974465148378e-07, "loss": -0.0606, "num_tokens": 25503497.0, "reward": 0.5166666507720947, "reward_std": 0.2840188145637512, "rewards/correctness_reward_func/mean": 0.2666666805744171, "rewards/correctness_reward_func/std": 0.393892765045166, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 739 }, { "completion_length": 861.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1548.0, "completions/max_terminated_length": 1548.0, "completions/mean_length": 861.6666870117188, "completions/mean_terminated_length": 861.6666870117188, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "epoch": 0.2510176390773406, "frac_reward_zero_std": 1.0, "grad_norm": 2.0568649006236228e-07, "kl": 0.0, "learning_rate": 3.811249137336094e-07, "loss": 0.0, "num_tokens": 25525681.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 740 }, { "completion_length": 2058.916748046875, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6243.0, "completions/mean_length": 4255.25, "completions/mean_terminated_length": 3088.375, "completions/min_length": 1242.0, "completions/min_terminated_length": 1242.0, "epoch": 0.25135685210312075, "frac_reward_zero_std": 0.0, "grad_norm": 1.2331775426864624, "kl": NaN, "learning_rate": 3.809523809523809e-07, "loss": -0.1065, "num_tokens": 25565538.0, "reward": 0.6666666865348816, "reward_std": 0.4954916834831238, "rewards/correctness_reward_func/mean": 0.46666669845581055, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 741 }, { "completion_length": 1583.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2655.0, "completions/max_terminated_length": 2655.0, "completions/mean_length": 1583.0, "completions/mean_terminated_length": 1583.0, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "epoch": 0.25169606512890097, "frac_reward_zero_std": 0.5, "grad_norm": 0.3111180067062378, "kl": 0.0, "learning_rate": 3.807798481711525e-07, "loss": -0.0014, "num_tokens": 25600728.0, "reward": 1.133333444595337, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 742 }, { "completion_length": 2449.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4081.0, "completions/max_terminated_length": 4081.0, "completions/mean_length": 2449.83349609375, "completions/mean_terminated_length": 2449.83349609375, "completions/min_length": 670.0, "completions/min_terminated_length": 670.0, "epoch": 0.25203527815468113, "frac_reward_zero_std": 1.0, "grad_norm": 2.9365247655732674e-07, "kl": 0.0, "learning_rate": 3.8060731538992407e-07, "loss": 0.0, "num_tokens": 25642138.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 743 }, { "completion_length": 1862.6666870117188, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5477.0, "completions/mean_length": 2411.75, "completions/mean_terminated_length": 2032.0, "completions/min_length": 1016.0, "completions/min_terminated_length": 1016.0, "epoch": 0.25237449118046135, "frac_reward_zero_std": 0.5, "grad_norm": 0.5083035826683044, "kl": NaN, "learning_rate": 3.8043478260869567e-07, "loss": -0.0015, "num_tokens": 25674756.0, "reward": 0.9416667819023132, "reward_std": 0.24983328580856323, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 744 }, { "completion_length": 2217.5001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5314.0, "completions/mean_length": 2766.58349609375, "completions/mean_terminated_length": 2419.091064453125, "completions/min_length": 660.0, "completions/min_terminated_length": 660.0, "epoch": 0.2527137042062415, "frac_reward_zero_std": 0.5, "grad_norm": 0.5263506174087524, "kl": NaN, "learning_rate": 3.8026224982746717e-07, "loss": -0.0127, "num_tokens": 25710984.0, "reward": 0.8875001668930054, "reward_std": 0.23438750207424164, "rewards/correctness_reward_func/mean": 0.5999999642372131, "rewards/correctness_reward_func/std": 0.36181363463401794, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 745 }, { "completion_length": 2103.3334350585938, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4934.0, "completions/mean_length": 2652.416748046875, "completions/mean_terminated_length": 2294.54541015625, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "epoch": 0.2530529172320217, "frac_reward_zero_std": 0.5, "grad_norm": 0.532516360282898, "kl": NaN, "learning_rate": 3.800897170462388e-07, "loss": -0.0576, "num_tokens": 25749766.0, "reward": 0.8083333969116211, "reward_std": 0.23327383399009705, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 746 }, { "completion_length": 2240.8333740234375, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6402.0, "completions/mean_length": 4437.1669921875, "completions/mean_terminated_length": 3361.25, "completions/min_length": 1404.0, "completions/min_terminated_length": 1404.0, "epoch": 0.2533921302578019, "frac_reward_zero_std": 0.0, "grad_norm": 0.7276754379272461, "kl": NaN, "learning_rate": 3.7991718426501033e-07, "loss": -0.0838, "num_tokens": 25790936.0, "reward": 0.6666666865348816, "reward_std": 0.4954916834831238, "rewards/correctness_reward_func/mean": 0.46666669845581055, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 747 }, { "completion_length": 935.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1434.0, "completions/max_terminated_length": 1434.0, "completions/mean_length": 935.75, "completions/mean_terminated_length": 935.75, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 0.2537313432835821, "frac_reward_zero_std": 1.0, "grad_norm": 9.218993568538281e-08, "kl": 0.0, "learning_rate": 3.797446514837819e-07, "loss": 0.0, "num_tokens": 25813547.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 748 }, { "completion_length": 1297.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3254.0, "completions/max_terminated_length": 3254.0, "completions/mean_length": 1297.8333740234375, "completions/mean_terminated_length": 1297.8333740234375, "completions/min_length": 514.0, "completions/min_terminated_length": 514.0, "epoch": 0.25407055630936226, "frac_reward_zero_std": 0.5, "grad_norm": 0.04760192334651947, "kl": 0.0, "learning_rate": 3.7957211870255344e-07, "loss": 0.0017, "num_tokens": 25845291.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 749 }, { "completion_length": 3342.08349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 5745.0, "completions/max_terminated_length": 5745.0, "completions/mean_length": 3342.08349609375, "completions/mean_terminated_length": 3342.08349609375, "completions/min_length": 986.0, "completions/min_terminated_length": 986.0, "epoch": 0.2544097693351425, "frac_reward_zero_std": 0.5, "grad_norm": 0.1283939778804779, "kl": 0.0, "learning_rate": 3.7939958592132504e-07, "loss": 0.0042, "num_tokens": 25900642.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 750 }, { "completion_length": 921.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1709.0, "completions/max_terminated_length": 1709.0, "completions/mean_length": 921.5, "completions/mean_terminated_length": 921.5, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.25474898236092264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.792270531400966e-07, "loss": 0.0, "num_tokens": 25921246.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 751 }, { "completion_length": 2476.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3929.0, "completions/max_terminated_length": 3929.0, "completions/mean_length": 2476.5, "completions/mean_terminated_length": 2476.5, "completions/min_length": 1234.0, "completions/min_terminated_length": 1234.0, "epoch": 0.25508819538670285, "frac_reward_zero_std": 0.0, "grad_norm": 0.14889118075370789, "kl": 0.0, "learning_rate": 3.7905452035886815e-07, "loss": -0.0032, "num_tokens": 25963354.0, "reward": 1.2333333492279053, "reward_std": 0.10327950119972229, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 752 }, { "completion_length": 1880.3333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4136.0, "completions/mean_length": 2429.416748046875, "completions/mean_terminated_length": 2051.272705078125, "completions/min_length": 939.0, "completions/min_terminated_length": 939.0, "epoch": 0.255427408412483, "frac_reward_zero_std": 0.0, "grad_norm": 0.7829989194869995, "kl": NaN, "learning_rate": 3.7888198757763975e-07, "loss": -0.0045, "num_tokens": 26000126.0, "reward": 0.9750000834465027, "reward_std": 0.36095842719078064, "rewards/correctness_reward_func/mean": 0.7000000476837158, "rewards/correctness_reward_func/std": 0.43064433336257935, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 753 }, { "completion_length": 1247.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4548.0, "completions/max_terminated_length": 4548.0, "completions/mean_length": 1247.416748046875, "completions/mean_terminated_length": 1247.416748046875, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 0.25576662143826323, "frac_reward_zero_std": 1.0, "grad_norm": 2.129471994294363e-07, "kl": 0.0, "learning_rate": 3.787094547964113e-07, "loss": 0.0, "num_tokens": 26026249.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 754 }, { "completion_length": 1251.9166870117188, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4579.0, "completions/mean_length": 2350.08349609375, "completions/mean_terminated_length": 1502.300048828125, "completions/min_length": 654.0, "completions/min_terminated_length": 654.0, "epoch": 0.25610583446404345, "frac_reward_zero_std": 0.5, "grad_norm": 0.3859163224697113, "kl": NaN, "learning_rate": 3.785369220151829e-07, "loss": -0.0551, "num_tokens": 26054394.0, "reward": 1.0833332538604736, "reward_std": 0.3356585204601288, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 755 }, { "completion_length": 1971.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 5072.0, "completions/max_terminated_length": 5072.0, "completions/mean_length": 1971.916748046875, "completions/mean_terminated_length": 1971.916748046875, "completions/min_length": 738.0, "completions/min_terminated_length": 738.0, "epoch": 0.2564450474898236, "frac_reward_zero_std": 0.5, "grad_norm": 0.13238492608070374, "kl": 0.0, "learning_rate": 3.783643892339544e-07, "loss": -0.0047, "num_tokens": 26091593.0, "reward": 0.7333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 756 }, { "completion_length": 1132.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2217.0, "completions/max_terminated_length": 2217.0, "completions/mean_length": 1132.166748046875, "completions/mean_terminated_length": 1132.166748046875, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 0.2567842605156038, "frac_reward_zero_std": 0.5, "grad_norm": 0.04667339101433754, "kl": 0.0, "learning_rate": 3.78191856452726e-07, "loss": -0.0002, "num_tokens": 26114833.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 757 }, { "completion_length": 3478.0833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5985.0, "completions/mean_length": 4027.166748046875, "completions/mean_terminated_length": 3794.27294921875, "completions/min_length": 2198.0, "completions/min_terminated_length": 2198.0, "epoch": 0.257123473541384, "frac_reward_zero_std": 0.5, "grad_norm": 0.39453667402267456, "kl": NaN, "learning_rate": 3.7801932367149757e-07, "loss": -0.0182, "num_tokens": 26167190.0, "reward": 0.7416667938232422, "reward_std": 0.1855173110961914, "rewards/correctness_reward_func/mean": 0.46666669845581055, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 758 }, { "completion_length": 1258.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2983.0, "completions/max_terminated_length": 2983.0, "completions/mean_length": 1258.3333740234375, "completions/mean_terminated_length": 1258.3333740234375, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 0.2574626865671642, "frac_reward_zero_std": 1.0, "grad_norm": 1.669351519240081e-07, "kl": 0.0, "learning_rate": 3.778467908902691e-07, "loss": 0.0, "num_tokens": 26195550.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 759 }, { "completion_length": 708.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1381.0, "completions/max_terminated_length": 1381.0, "completions/mean_length": 708.9166870117188, "completions/mean_terminated_length": 708.9166870117188, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.25780189959294436, "frac_reward_zero_std": 0.5, "grad_norm": 0.04684539511799812, "kl": 0.0, "learning_rate": 3.776742581090407e-07, "loss": -0.0, "num_tokens": 26213207.0, "reward": 0.7875000834465027, "reward_std": 0.03061862848699093, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 760 }, { "completion_length": 869.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2138.0, "completions/max_terminated_length": 2138.0, "completions/mean_length": 869.0, "completions/mean_terminated_length": 869.0, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.2581411126187246, "frac_reward_zero_std": 0.5, "grad_norm": 0.07464686036109924, "kl": 0.0, "learning_rate": 3.775017253278123e-07, "loss": 0.0001, "num_tokens": 26235623.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 761 }, { "completion_length": 1791.666748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5793.0, "completions/mean_length": 2340.75, "completions/mean_terminated_length": 1954.5455322265625, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 0.25848032564450474, "frac_reward_zero_std": 0.0, "grad_norm": 0.49990057945251465, "kl": NaN, "learning_rate": 3.7732919254658383e-07, "loss": -0.0433, "num_tokens": 26269273.0, "reward": 1.0250000953674316, "reward_std": 0.2906580865383148, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.36306774616241455, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 762 }, { "completion_length": 1257.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2279.0, "completions/max_terminated_length": 2279.0, "completions/mean_length": 1257.666748046875, "completions/mean_terminated_length": 1257.666748046875, "completions/min_length": 546.0, "completions/min_terminated_length": 546.0, "epoch": 0.25881953867028495, "frac_reward_zero_std": 1.0, "grad_norm": 9.947257240128238e-08, "kl": 0.0, "learning_rate": 3.771566597653554e-07, "loss": 0.0, "num_tokens": 26291937.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 763 }, { "completion_length": 2138.5, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 4321.0, "completions/mean_length": 3785.75, "completions/mean_terminated_length": 2851.333251953125, "completions/min_length": 1372.0, "completions/min_terminated_length": 1372.0, "epoch": 0.2591587516960651, "frac_reward_zero_std": 0.0, "grad_norm": 1.0325132608413696, "kl": NaN, "learning_rate": 3.7698412698412694e-07, "loss": -0.0504, "num_tokens": 26331945.0, "reward": 0.7416666746139526, "reward_std": 0.4976527690887451, "rewards/correctness_reward_func/mean": 0.5166666507720947, "rewards/correctness_reward_func/std": 0.4628632962703705, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 764 }, { "completion_length": 847.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1307.0, "completions/max_terminated_length": 1307.0, "completions/mean_length": 847.9166870117188, "completions/mean_terminated_length": 847.9166870117188, "completions/min_length": 562.0, "completions/min_terminated_length": 562.0, "epoch": 0.25949796472184533, "frac_reward_zero_std": 1.0, "grad_norm": 1.0442206388461273e-07, "kl": 0.0, "learning_rate": 3.7681159420289855e-07, "loss": 0.0, "num_tokens": 26353748.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 765 }, { "completion_length": 719.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1199.0, "completions/max_terminated_length": 1199.0, "completions/mean_length": 719.1666870117188, "completions/mean_terminated_length": 719.1666870117188, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.2598371777476255, "frac_reward_zero_std": 0.5, "grad_norm": 0.2788970172405243, "kl": 0.0, "learning_rate": 3.766390614216701e-07, "loss": -0.0009, "num_tokens": 26376988.0, "reward": 0.6708333492279053, "reward_std": 0.22383961081504822, "rewards/correctness_reward_func/mean": 0.38333332538604736, "rewards/correctness_reward_func/std": 0.4783177673816681, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 766 }, { "completion_length": 1171.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 1726.0, "completions/max_terminated_length": 1726.0, "completions/mean_length": 1171.916748046875, "completions/mean_terminated_length": 1171.916748046875, "completions/min_length": 713.0, "completions/min_terminated_length": 713.0, "epoch": 0.2601763907734057, "frac_reward_zero_std": 1.0, "grad_norm": 2.0947449286268238e-07, "kl": 0.0, "learning_rate": 3.7646652864044165e-07, "loss": 0.0, "num_tokens": 26401281.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 767 }, { "completion_length": 2836.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5037.0, "completions/max_terminated_length": 5037.0, "completions/mean_length": 2836.166748046875, "completions/mean_terminated_length": 2836.166748046875, "completions/min_length": 1051.0, "completions/min_terminated_length": 1051.0, "epoch": 0.26051560379918587, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.7629399585921326e-07, "loss": 0.0, "num_tokens": 26448257.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 768 }, { "completion_length": 1207.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2167.0, "completions/max_terminated_length": 2167.0, "completions/mean_length": 1207.416748046875, "completions/mean_terminated_length": 1207.416748046875, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "epoch": 0.2608548168249661, "frac_reward_zero_std": 0.5, "grad_norm": 0.06325611472129822, "kl": 0.0, "learning_rate": 3.761214630779848e-07, "loss": 0.0005, "num_tokens": 26474326.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 769 }, { "completion_length": 2223.1666870117188, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5250.0, "completions/mean_length": 2772.25, "completions/mean_terminated_length": 2425.272705078125, "completions/min_length": 816.0, "completions/min_terminated_length": 816.0, "epoch": 0.26119402985074625, "frac_reward_zero_std": 0.5, "grad_norm": 0.499055415391922, "kl": NaN, "learning_rate": 3.7594893029675636e-07, "loss": -0.0143, "num_tokens": 26511738.0, "reward": 0.9083334803581238, "reward_std": 0.2905454635620117, "rewards/correctness_reward_func/mean": 0.6333333253860474, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 770 }, { "completion_length": 2154.8334350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 5527.0, "completions/max_terminated_length": 5527.0, "completions/mean_length": 2154.83349609375, "completions/mean_terminated_length": 2154.83349609375, "completions/min_length": 946.0, "completions/min_terminated_length": 946.0, "epoch": 0.26153324287652646, "frac_reward_zero_std": 0.0, "grad_norm": 0.16773566603660583, "kl": 0.0, "learning_rate": 3.757763975155279e-07, "loss": 0.0018, "num_tokens": 26548642.0, "reward": 1.183333396911621, "reward_std": 0.09246458858251572, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 771 }, { "completion_length": 502.16668701171875, "completions/clipped_ratio": 0.0, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 502.16668701171875, "completions/mean_terminated_length": 502.16668701171875, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.2618724559023066, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.756038647342995e-07, "loss": 0.0, "num_tokens": 26566974.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 772 }, { "completion_length": 684.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1408.0, "completions/max_terminated_length": 1408.0, "completions/mean_length": 684.4166870117188, "completions/mean_terminated_length": 684.4166870117188, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.26221166892808684, "frac_reward_zero_std": 0.5, "grad_norm": 0.07578389346599579, "kl": 0.0, "learning_rate": 3.754313319530711e-07, "loss": -0.0004, "num_tokens": 26585555.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 773 }, { "completion_length": 1621.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3594.0, "completions/max_terminated_length": 3594.0, "completions/mean_length": 1621.8333740234375, "completions/mean_terminated_length": 1621.8333740234375, "completions/min_length": 887.0, "completions/min_terminated_length": 887.0, "epoch": 0.26255088195386705, "frac_reward_zero_std": 1.0, "grad_norm": 2.3733241505397018e-07, "kl": 0.0, "learning_rate": 3.7525879917184263e-07, "loss": 0.0, "num_tokens": 26615775.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 774 }, { "completion_length": 1442.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4935.0, "completions/max_terminated_length": 4935.0, "completions/mean_length": 1442.166748046875, "completions/mean_terminated_length": 1442.166748046875, "completions/min_length": 592.0, "completions/min_terminated_length": 592.0, "epoch": 0.2628900949796472, "frac_reward_zero_std": 0.5, "grad_norm": 0.1121632531285286, "kl": 0.0, "learning_rate": 3.750862663906142e-07, "loss": -0.0074, "num_tokens": 26647673.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 775 }, { "completion_length": 520.5000152587891, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 520.5, "completions/mean_terminated_length": 520.5, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.26322930800542743, "frac_reward_zero_std": 0.5, "grad_norm": 0.2791745960712433, "kl": 0.0, "learning_rate": 3.749137336093858e-07, "loss": -0.0005, "num_tokens": 26665733.0, "reward": 1.1041667461395264, "reward_std": 0.2002602219581604, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 776 }, { "completion_length": 702.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1141.0, "completions/max_terminated_length": 1141.0, "completions/mean_length": 702.0, "completions/mean_terminated_length": 702.0, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.2635685210312076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.7474120082815734e-07, "loss": 0.0, "num_tokens": 26684873.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 777 }, { "completion_length": 1498.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1975.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 1498.75, "completions/mean_terminated_length": 1498.75, "completions/min_length": 1073.0, "completions/min_terminated_length": 1073.0, "epoch": 0.2639077340569878, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.745686680469289e-07, "loss": 0.0, "num_tokens": 26715134.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 778 }, { "completion_length": 1654.166748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5809.0, "completions/mean_length": 3301.416748046875, "completions/mean_terminated_length": 2205.5556640625, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 0.26424694708276797, "frac_reward_zero_std": 0.5, "grad_norm": 0.9202248454093933, "kl": NaN, "learning_rate": 3.7439613526570044e-07, "loss": -0.0862, "num_tokens": 26745628.0, "reward": 0.8583333492279053, "reward_std": 0.27095508575439453, "rewards/correctness_reward_func/mean": 0.6333333253860474, "rewards/correctness_reward_func/std": 0.4735424220561981, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 779 }, { "completion_length": 1470.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3567.0, "completions/max_terminated_length": 3567.0, "completions/mean_length": 1470.75, "completions/mean_terminated_length": 1470.75, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.2645861601085482, "frac_reward_zero_std": 0.5, "grad_norm": 0.10494286566972733, "kl": 0.0, "learning_rate": 3.7422360248447205e-07, "loss": -0.0021, "num_tokens": 26774809.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 780 }, { "completion_length": 2154.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4803.0, "completions/max_terminated_length": 4803.0, "completions/mean_length": 2154.166748046875, "completions/mean_terminated_length": 2154.166748046875, "completions/min_length": 835.0, "completions/min_terminated_length": 835.0, "epoch": 0.26492537313432835, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.740510697032436e-07, "loss": 0.0, "num_tokens": 26810025.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 781 }, { "completion_length": 1063.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1335.0, "completions/max_terminated_length": 1335.0, "completions/mean_length": 1063.25, "completions/mean_terminated_length": 1063.25, "completions/min_length": 722.0, "completions/min_terminated_length": 722.0, "epoch": 0.26526458616010856, "frac_reward_zero_std": 0.5, "grad_norm": 0.11350041627883911, "kl": 0.0, "learning_rate": 3.7387853692201516e-07, "loss": -0.0009, "num_tokens": 26833140.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 782 }, { "completion_length": 1292.5833740234375, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6559.0, "completions/mean_length": 3488.916748046875, "completions/mean_terminated_length": 1938.875, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.2656037991858887, "frac_reward_zero_std": 0.5, "grad_norm": 1.5954703092575073, "kl": NaN, "learning_rate": 3.737060041407867e-07, "loss": -0.1164, "num_tokens": 26860345.0, "reward": 0.7333334684371948, "reward_std": 0.2840188145637512, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 783 }, { "completion_length": 2248.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4318.0, "completions/max_terminated_length": 4318.0, "completions/mean_length": 2248.916748046875, "completions/mean_terminated_length": 2248.916748046875, "completions/min_length": 808.0, "completions/min_terminated_length": 808.0, "epoch": 0.26594301221166894, "frac_reward_zero_std": 0.5, "grad_norm": 0.35711029171943665, "kl": 0.0, "learning_rate": 3.735334713595583e-07, "loss": -0.0026, "num_tokens": 26896800.0, "reward": 0.7666667699813843, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.46666669845581055, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 784 }, { "completion_length": 1661.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2417.0, "completions/max_terminated_length": 2417.0, "completions/mean_length": 1661.5833740234375, "completions/mean_terminated_length": 1661.5833740234375, "completions/min_length": 810.0, "completions/min_terminated_length": 810.0, "epoch": 0.2662822252374491, "frac_reward_zero_std": 0.5, "grad_norm": 0.15416787564754486, "kl": 0.0, "learning_rate": 3.7336093857832987e-07, "loss": -0.0007, "num_tokens": 26927623.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 785 }, { "completion_length": 779.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1260.0, "completions/max_terminated_length": 1260.0, "completions/mean_length": 779.75, "completions/mean_terminated_length": 779.75, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 0.2666214382632293, "frac_reward_zero_std": 0.5, "grad_norm": 0.07597201317548752, "kl": 0.0, "learning_rate": 3.731884057971014e-07, "loss": 0.0004, "num_tokens": 26947360.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 786 }, { "completion_length": 2350.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4980.0, "completions/max_terminated_length": 4980.0, "completions/mean_length": 2350.83349609375, "completions/mean_terminated_length": 2350.83349609375, "completions/min_length": 1081.0, "completions/min_terminated_length": 1081.0, "epoch": 0.2669606512890095, "frac_reward_zero_std": 0.0, "grad_norm": 0.6928236484527588, "kl": 0.0, "learning_rate": 3.73015873015873e-07, "loss": 0.0143, "num_tokens": 26985722.0, "reward": 1.070833444595337, "reward_std": 0.2486901879310608, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 787 }, { "completion_length": 1690.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4178.0, "completions/max_terminated_length": 4178.0, "completions/mean_length": 1690.166748046875, "completions/mean_terminated_length": 1690.166748046875, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 0.2672998643147897, "frac_reward_zero_std": 0.5, "grad_norm": 0.48641929030418396, "kl": 0.0, "learning_rate": 3.728433402346446e-07, "loss": 0.0025, "num_tokens": 27017704.0, "reward": 0.4333333671092987, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.13333334028720856, "rewards/correctness_reward_func/std": 0.3113996088504791, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 788 }, { "completion_length": 721.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1439.0, "completions/max_terminated_length": 1439.0, "completions/mean_length": 721.75, "completions/mean_terminated_length": 721.75, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.26763907734056985, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.7267080745341613e-07, "loss": 0.0, "num_tokens": 27044245.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 789 }, { "completion_length": 3101.166748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6048.0, "completions/mean_length": 3650.25, "completions/mean_terminated_length": 3383.091064453125, "completions/min_length": 2335.0, "completions/min_terminated_length": 2335.0, "epoch": 0.26797829036635007, "frac_reward_zero_std": 0.0, "grad_norm": 0.937369704246521, "kl": NaN, "learning_rate": 3.724982746721877e-07, "loss": -0.0396, "num_tokens": 27088401.0, "reward": 0.7041667699813843, "reward_std": 0.46168631315231323, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 790 }, { "completion_length": 2644.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 6416.0, "completions/max_terminated_length": 6416.0, "completions/mean_length": 2644.08349609375, "completions/mean_terminated_length": 2644.08349609375, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 0.2683175033921303, "frac_reward_zero_std": 0.5, "grad_norm": 0.6890131235122681, "kl": 0.0, "learning_rate": 3.723257418909593e-07, "loss": -0.0096, "num_tokens": 27136918.0, "reward": 0.38333332538604736, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.0833333358168602, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 791 }, { "completion_length": 1909.0834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 3999.0, "completions/max_terminated_length": 3999.0, "completions/mean_length": 1909.0833740234375, "completions/mean_terminated_length": 1909.0833740234375, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 0.26865671641791045, "frac_reward_zero_std": 0.5, "grad_norm": 0.6697491407394409, "kl": 0.0, "learning_rate": 3.7215320910973084e-07, "loss": 0.0026, "num_tokens": 27169673.0, "reward": 1.0333333015441895, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.44585633277893066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 792 }, { "completion_length": 594.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 594.4166870117188, "completions/mean_terminated_length": 594.4166870117188, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.26899592944369066, "frac_reward_zero_std": 0.5, "grad_norm": 0.06149730086326599, "kl": 0.0, "learning_rate": 3.719806763285024e-07, "loss": 0.0001, "num_tokens": 27189796.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 793 }, { "completion_length": 974.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3086.0, "completions/max_terminated_length": 3086.0, "completions/mean_length": 974.5, "completions/mean_terminated_length": 974.5, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.2693351424694708, "frac_reward_zero_std": 1.0, "grad_norm": 2.1048205667284492e-07, "kl": 0.0, "learning_rate": 3.7180814354727395e-07, "loss": 0.0, "num_tokens": 27212596.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 794 }, { "completion_length": 2349.83349609375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4858.0, "completions/mean_length": 2898.916748046875, "completions/mean_terminated_length": 2563.45458984375, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "epoch": 0.26967435549525104, "frac_reward_zero_std": 0.5, "grad_norm": 0.6481123566627502, "kl": NaN, "learning_rate": 3.7163561076604555e-07, "loss": -0.0263, "num_tokens": 27249230.0, "reward": 0.770833432674408, "reward_std": 0.2123773992061615, "rewards/correctness_reward_func/mean": 0.4833333194255829, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 795 }, { "completion_length": 1115.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2305.0, "completions/max_terminated_length": 2305.0, "completions/mean_length": 1115.75, "completions/mean_terminated_length": 1115.75, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.2700135685210312, "frac_reward_zero_std": 0.0, "grad_norm": 0.08609779179096222, "kl": 0.0, "learning_rate": 3.7146307798481705e-07, "loss": 0.0008, "num_tokens": 27273443.0, "reward": 1.2708332538604736, "reward_std": 0.07144343107938766, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 796 }, { "completion_length": 1959.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3710.0, "completions/max_terminated_length": 3710.0, "completions/mean_length": 1959.416748046875, "completions/mean_terminated_length": 1959.416748046875, "completions/min_length": 1178.0, "completions/min_terminated_length": 1178.0, "epoch": 0.2703527815468114, "frac_reward_zero_std": 0.0, "grad_norm": 0.7615066766738892, "kl": 0.0, "learning_rate": 3.7129054520358866e-07, "loss": 0.0246, "num_tokens": 27307120.0, "reward": 0.7666666507720947, "reward_std": 0.4647580087184906, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 797 }, { "completion_length": 858.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1442.0, "completions/max_terminated_length": 1442.0, "completions/mean_length": 858.9166870117188, "completions/mean_terminated_length": 858.9166870117188, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 0.2706919945725916, "frac_reward_zero_std": 0.5, "grad_norm": 0.3698106110095978, "kl": 0.0, "learning_rate": 3.711180124223602e-07, "loss": 0.0015, "num_tokens": 27332229.0, "reward": 1.0208334922790527, "reward_std": 0.19391795992851257, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 798 }, { "completion_length": 2415.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4826.0, "completions/max_terminated_length": 4826.0, "completions/mean_length": 2415.666748046875, "completions/mean_terminated_length": 2415.666748046875, "completions/min_length": 793.0, "completions/min_terminated_length": 793.0, "epoch": 0.2710312075983718, "frac_reward_zero_std": 0.5, "grad_norm": 0.6135579347610474, "kl": 0.0, "learning_rate": 3.709454796411318e-07, "loss": 0.0115, "num_tokens": 27375395.0, "reward": 1.1708333492279053, "reward_std": 0.22383961081504822, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 799 }, { "completion_length": 1509.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4804.0, "completions/max_terminated_length": 4804.0, "completions/mean_length": 1509.916748046875, "completions/mean_terminated_length": 1509.916748046875, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 0.27137042062415195, "frac_reward_zero_std": 1.0, "grad_norm": 1.8519099853619991e-07, "kl": 0.0, "learning_rate": 3.7077294685990337e-07, "loss": 0.0, "num_tokens": 27403858.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 800 }, { "completion_length": 1219.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2203.0, "completions/max_terminated_length": 2203.0, "completions/mean_length": 1219.166748046875, "completions/mean_terminated_length": 1219.166748046875, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.27170963364993217, "frac_reward_zero_std": 0.5, "grad_norm": 0.10737733542919159, "kl": 0.0, "learning_rate": 3.706004140786749e-07, "loss": 0.0009, "num_tokens": 27433344.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 801 }, { "completion_length": 606.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1109.0, "completions/max_terminated_length": 1109.0, "completions/mean_length": 606.6666870117188, "completions/mean_terminated_length": 606.6666870117188, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.27204884667571233, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.7042788129744653e-07, "loss": 0.0, "num_tokens": 27451136.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 802 }, { "completion_length": 1556.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3046.0, "completions/max_terminated_length": 3046.0, "completions/mean_length": 1556.5, "completions/mean_terminated_length": 1556.5, "completions/min_length": 687.0, "completions/min_terminated_length": 687.0, "epoch": 0.27238805970149255, "frac_reward_zero_std": 0.5, "grad_norm": 0.5762483477592468, "kl": 0.0, "learning_rate": 3.702553485162181e-07, "loss": 0.0254, "num_tokens": 27480152.0, "reward": 1.1208332777023315, "reward_std": 0.27857524156570435, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 803 }, { "completion_length": 1405.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2416.0, "completions/max_terminated_length": 2416.0, "completions/mean_length": 1405.0, "completions/mean_terminated_length": 1405.0, "completions/min_length": 628.0, "completions/min_terminated_length": 628.0, "epoch": 0.2727272727272727, "frac_reward_zero_std": 0.5, "grad_norm": 0.4991465210914612, "kl": 0.0, "learning_rate": 3.7008281573498964e-07, "loss": -0.0007, "num_tokens": 27506474.0, "reward": 0.46666666865348816, "reward_std": 0.2581988573074341, "rewards/correctness_reward_func/mean": 0.1666666716337204, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 804 }, { "completion_length": 1961.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3442.0, "completions/max_terminated_length": 3442.0, "completions/mean_length": 1961.3333740234375, "completions/mean_terminated_length": 1961.3333740234375, "completions/min_length": 982.0, "completions/min_terminated_length": 982.0, "epoch": 0.2730664857530529, "frac_reward_zero_std": 1.0, "grad_norm": 1.4712648521708616e-07, "kl": 0.0, "learning_rate": 3.699102829537612e-07, "loss": 0.0, "num_tokens": 27545088.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 805 }, { "completion_length": 1769.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3968.0, "completions/max_terminated_length": 3968.0, "completions/mean_length": 1769.0, "completions/mean_terminated_length": 1769.0, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.2734056987788331, "frac_reward_zero_std": 0.0, "grad_norm": 0.11471240222454071, "kl": 0.0, "learning_rate": 3.697377501725328e-07, "loss": -0.0003, "num_tokens": 27581796.0, "reward": 1.1666667461395264, "reward_std": 0.09559707343578339, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 806 }, { "completion_length": 1644.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3036.0, "completions/max_terminated_length": 3036.0, "completions/mean_length": 1644.75, "completions/mean_terminated_length": 1644.75, "completions/min_length": 803.0, "completions/min_terminated_length": 803.0, "epoch": 0.2737449118046133, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.695652173913043e-07, "loss": 0.0, "num_tokens": 27614289.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 807 }, { "completion_length": 1305.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3489.0, "completions/max_terminated_length": 3489.0, "completions/mean_length": 1305.0833740234375, "completions/mean_terminated_length": 1305.0833740234375, "completions/min_length": 634.0, "completions/min_terminated_length": 634.0, "epoch": 0.27408412483039346, "frac_reward_zero_std": 0.5, "grad_norm": 0.5795224905014038, "kl": 0.0, "learning_rate": 3.693926846100759e-07, "loss": 0.0063, "num_tokens": 27644182.0, "reward": 1.0499999523162842, "reward_std": 0.27386125922203064, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 808 }, { "completion_length": 1074.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1799.0, "completions/max_terminated_length": 1799.0, "completions/mean_length": 1074.166748046875, "completions/mean_terminated_length": 1074.166748046875, "completions/min_length": 687.0, "completions/min_terminated_length": 687.0, "epoch": 0.2744233378561737, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.6922015182884745e-07, "loss": 0.0, "num_tokens": 27668220.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 809 }, { "completion_length": 974.9167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 1649.0, "completions/max_terminated_length": 1649.0, "completions/mean_length": 974.9166870117188, "completions/mean_terminated_length": 974.9166870117188, "completions/min_length": 667.0, "completions/min_terminated_length": 667.0, "epoch": 0.2747625508819539, "frac_reward_zero_std": 0.5, "grad_norm": 0.06648023426532745, "kl": 0.0, "learning_rate": 3.6904761904761906e-07, "loss": -0.0, "num_tokens": 27695639.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 810 }, { "completion_length": 1228.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 1766.0, "completions/max_terminated_length": 1766.0, "completions/mean_length": 1228.666748046875, "completions/mean_terminated_length": 1228.666748046875, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.27510176390773405, "frac_reward_zero_std": 0.0, "grad_norm": 0.3744871914386749, "kl": 0.0, "learning_rate": 3.6887508626639056e-07, "loss": 0.0064, "num_tokens": 27720211.0, "reward": 1.1166667938232422, "reward_std": 0.24096208810806274, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 811 }, { "completion_length": 1277.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1874.0, "completions/max_terminated_length": 1874.0, "completions/mean_length": 1277.75, "completions/mean_terminated_length": 1277.75, "completions/min_length": 791.0, "completions/min_terminated_length": 791.0, "epoch": 0.27544097693351427, "frac_reward_zero_std": 0.5, "grad_norm": 0.08306435495615005, "kl": 0.0, "learning_rate": 3.6870255348516216e-07, "loss": 0.0017, "num_tokens": 27745120.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 812 }, { "completion_length": 1650.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3301.0, "completions/max_terminated_length": 3301.0, "completions/mean_length": 1650.916748046875, "completions/mean_terminated_length": 1650.916748046875, "completions/min_length": 864.0, "completions/min_terminated_length": 864.0, "epoch": 0.27578018995929443, "frac_reward_zero_std": 0.0, "grad_norm": 0.23837661743164062, "kl": 0.0, "learning_rate": 3.685300207039337e-07, "loss": 0.0016, "num_tokens": 27777681.0, "reward": 0.7833334803581238, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func/mean": 0.4833333194255829, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 813 }, { "completion_length": 703.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1160.0, "completions/max_terminated_length": 1160.0, "completions/mean_length": 703.1666870117188, "completions/mean_terminated_length": 703.1666870117188, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.27611940298507465, "frac_reward_zero_std": 0.5, "grad_norm": 0.1073731854557991, "kl": 0.0, "learning_rate": 3.683574879227053e-07, "loss": -0.0005, "num_tokens": 27798101.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 814 }, { "completion_length": 1182.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1865.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 1182.75, "completions/mean_terminated_length": 1182.75, "completions/min_length": 749.0, "completions/min_terminated_length": 749.0, "epoch": 0.2764586160108548, "frac_reward_zero_std": 0.5, "grad_norm": 0.0674026682972908, "kl": 0.0, "learning_rate": 3.681849551414769e-07, "loss": 0.0014, "num_tokens": 27818804.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 815 }, { "completion_length": 1002.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1656.0, "completions/max_terminated_length": 1656.0, "completions/mean_length": 1002.5833740234375, "completions/mean_terminated_length": 1002.5833740234375, "completions/min_length": 650.0, "completions/min_terminated_length": 650.0, "epoch": 0.276797829036635, "frac_reward_zero_std": 1.0, "grad_norm": 1.705409573560246e-07, "kl": 0.0, "learning_rate": 3.6801242236024843e-07, "loss": 0.0, "num_tokens": 27847707.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 816 }, { "completion_length": 2383.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 6308.0, "completions/max_terminated_length": 6308.0, "completions/mean_length": 2383.0, "completions/mean_terminated_length": 2383.0, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "epoch": 0.2771370420624152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.6783988957902003e-07, "loss": 0.0, "num_tokens": 27886677.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 817 }, { "completion_length": 2868.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6046.0, "completions/max_terminated_length": 6046.0, "completions/mean_length": 2868.666748046875, "completions/mean_terminated_length": 2868.666748046875, "completions/min_length": 1894.0, "completions/min_terminated_length": 1894.0, "epoch": 0.2774762550881954, "frac_reward_zero_std": 0.5, "grad_norm": 0.09324238449335098, "kl": 0.0, "learning_rate": 3.6766735679779153e-07, "loss": -0.0024, "num_tokens": 27936599.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 818 }, { "completion_length": 1085.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1940.0, "completions/max_terminated_length": 1940.0, "completions/mean_length": 1085.0833740234375, "completions/mean_terminated_length": 1085.0833740234375, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 0.27781546811397556, "frac_reward_zero_std": 1.0, "grad_norm": 1.5154705579334404e-07, "kl": 0.0, "learning_rate": 3.6749482401656314e-07, "loss": 0.0, "num_tokens": 27962658.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 819 }, { "completion_length": 2202.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5019.0, "completions/max_terminated_length": 5019.0, "completions/mean_length": 2202.25, "completions/mean_terminated_length": 2202.25, "completions/min_length": 559.0, "completions/min_terminated_length": 559.0, "epoch": 0.2781546811397558, "frac_reward_zero_std": 0.0, "grad_norm": 0.5000991821289062, "kl": 0.0, "learning_rate": 3.673222912353347e-07, "loss": 0.0171, "num_tokens": 28000857.0, "reward": 0.8208333849906921, "reward_std": 0.23816029727458954, "rewards/correctness_reward_func/mean": 0.5333333015441895, "rewards/correctness_reward_func/std": 0.47736650705337524, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 820 }, { "completion_length": 3302.916748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6512.0, "completions/mean_length": 3852.0, "completions/mean_terminated_length": 3603.181884765625, "completions/min_length": 915.0, "completions/min_terminated_length": 915.0, "epoch": 0.27849389416553594, "frac_reward_zero_std": 0.5, "grad_norm": 0.2379552721977234, "kl": NaN, "learning_rate": 3.671497584541063e-07, "loss": -0.0582, "num_tokens": 28052630.0, "reward": 1.120833396911621, "reward_std": 0.19391795992851257, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.28069180250167847, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 821 }, { "completion_length": 2307.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 3861.0, "completions/max_terminated_length": 3861.0, "completions/mean_length": 2307.25, "completions/mean_terminated_length": 2307.25, "completions/min_length": 1338.0, "completions/min_terminated_length": 1338.0, "epoch": 0.27883310719131615, "frac_reward_zero_std": 0.0, "grad_norm": 0.6435486674308777, "kl": 0.0, "learning_rate": 3.669772256728778e-07, "loss": 0.0243, "num_tokens": 28093595.0, "reward": 0.8833333849906921, "reward_std": 0.29902368783950806, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 822 }, { "completion_length": 2714.2501220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5474.0, "completions/mean_length": 3263.33349609375, "completions/mean_terminated_length": 2961.0, "completions/min_length": 859.0, "completions/min_terminated_length": 859.0, "epoch": 0.2791723202170963, "frac_reward_zero_std": 0.5, "grad_norm": 0.2697756886482239, "kl": NaN, "learning_rate": 3.668046928916494e-07, "loss": -0.0554, "num_tokens": 28140860.0, "reward": 1.0416667461395264, "reward_std": 0.2457980364561081, "rewards/correctness_reward_func/mean": 0.7666667103767395, "rewards/correctness_reward_func/std": 0.2534608840942383, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 823 }, { "completion_length": 2003.3334350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 4538.0, "completions/max_terminated_length": 4538.0, "completions/mean_length": 2003.3333740234375, "completions/mean_terminated_length": 2003.3333740234375, "completions/min_length": 551.0, "completions/min_terminated_length": 551.0, "epoch": 0.27951153324287653, "frac_reward_zero_std": 0.5, "grad_norm": 0.072422556579113, "kl": 0.0, "learning_rate": 3.6663216011042096e-07, "loss": -0.0009, "num_tokens": 28174818.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 824 }, { "completion_length": 1121.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1858.0, "completions/max_terminated_length": 1858.0, "completions/mean_length": 1121.5, "completions/mean_terminated_length": 1121.5, "completions/min_length": 850.0, "completions/min_terminated_length": 850.0, "epoch": 0.2798507462686567, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.6645962732919256e-07, "loss": 0.0, "num_tokens": 28197018.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 825 }, { "completion_length": 1077.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3488.0, "completions/max_terminated_length": 3488.0, "completions/mean_length": 1077.416748046875, "completions/mean_terminated_length": 1077.416748046875, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 0.2801899592944369, "frac_reward_zero_std": 0.0, "grad_norm": 0.6790959239006042, "kl": 0.0, "learning_rate": 3.6628709454796406e-07, "loss": 0.0398, "num_tokens": 28224377.0, "reward": 0.9666666984558105, "reward_std": 0.3098386526107788, "rewards/correctness_reward_func/mean": 0.6666666269302368, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 826 }, { "completion_length": 690.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1055.0, "completions/max_terminated_length": 1055.0, "completions/mean_length": 690.6666870117188, "completions/mean_terminated_length": 690.6666870117188, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.2805291723202171, "frac_reward_zero_std": 1.0, "grad_norm": 1.7838985399976082e-07, "kl": 0.0, "learning_rate": 3.6611456176673567e-07, "loss": 0.0, "num_tokens": 28246261.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 827 }, { "completion_length": 2305.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6477.0, "completions/max_terminated_length": 6477.0, "completions/mean_length": 2305.416748046875, "completions/mean_terminated_length": 2305.416748046875, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.2808683853459973, "frac_reward_zero_std": 0.0, "grad_norm": 0.897094190120697, "kl": 0.0, "learning_rate": 3.659420289855072e-07, "loss": -0.0296, "num_tokens": 28284372.0, "reward": 0.8000000715255737, "reward_std": 0.24494895339012146, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.4472135901451111, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 828 }, { "completion_length": 3132.916748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6016.0, "completions/mean_length": 4780.1669921875, "completions/mean_terminated_length": 4177.22216796875, "completions/min_length": 2212.0, "completions/min_terminated_length": 2212.0, "epoch": 0.2812075983717775, "frac_reward_zero_std": 0.0, "grad_norm": 0.5745806097984314, "kl": NaN, "learning_rate": 3.657694962042788e-07, "loss": -0.0817, "num_tokens": 28334783.0, "reward": 0.9000000953674316, "reward_std": 0.3510836958885193, "rewards/correctness_reward_func/mean": 0.6500000357627869, "rewards/correctness_reward_func/std": 0.40113475918769836, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 829 }, { "completion_length": 1060.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1373.0, "completions/max_terminated_length": 1373.0, "completions/mean_length": 1060.5833740234375, "completions/mean_terminated_length": 1060.5833740234375, "completions/min_length": 712.0, "completions/min_terminated_length": 712.0, "epoch": 0.28154681139755766, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.6559696342305033e-07, "loss": 0.0, "num_tokens": 28357872.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 830 }, { "completion_length": 1614.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3408.0, "completions/max_terminated_length": 3408.0, "completions/mean_length": 1614.5833740234375, "completions/mean_terminated_length": 1614.5833740234375, "completions/min_length": 524.0, "completions/min_terminated_length": 524.0, "epoch": 0.2818860244233379, "frac_reward_zero_std": 0.5, "grad_norm": 0.05196267366409302, "kl": 0.0, "learning_rate": 3.6542443064182193e-07, "loss": -0.0006, "num_tokens": 28387807.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 831 }, { "completion_length": 1116.5, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 3091.0, "completions/mean_length": 2763.75, "completions/mean_terminated_length": 1488.6666259765625, "completions/min_length": 654.0, "completions/min_terminated_length": 654.0, "epoch": 0.28222523744911804, "frac_reward_zero_std": 0.5, "grad_norm": 1.0311174392700195, "kl": NaN, "learning_rate": 3.6525189786059354e-07, "loss": -0.0566, "num_tokens": 28413589.0, "reward": 0.4749999940395355, "reward_std": 0.35601967573165894, "rewards/correctness_reward_func/mean": 0.25, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 832 }, { "completion_length": 905.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1330.0, "completions/max_terminated_length": 1330.0, "completions/mean_length": 905.0, "completions/mean_terminated_length": 905.0, "completions/min_length": 630.0, "completions/min_terminated_length": 630.0, "epoch": 0.28256445047489825, "frac_reward_zero_std": 1.0, "grad_norm": 2.9492409225895244e-07, "kl": 0.0, "learning_rate": 3.6507936507936504e-07, "loss": 0.0, "num_tokens": 28437493.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 833 }, { "completion_length": 1761.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3188.0, "completions/max_terminated_length": 3188.0, "completions/mean_length": 1761.25, "completions/mean_terminated_length": 1761.25, "completions/min_length": 734.0, "completions/min_terminated_length": 734.0, "epoch": 0.2829036635006784, "frac_reward_zero_std": 0.5, "grad_norm": 0.07774775475263596, "kl": 0.0, "learning_rate": 3.6490683229813664e-07, "loss": -0.0002, "num_tokens": 28470862.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 834 }, { "completion_length": 758.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1266.0, "completions/max_terminated_length": 1266.0, "completions/mean_length": 758.8333740234375, "completions/mean_terminated_length": 758.8333740234375, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 0.28324287652645863, "frac_reward_zero_std": 1.0, "grad_norm": 1.0557556606727303e-07, "kl": 0.0, "learning_rate": 3.647342995169082e-07, "loss": 0.0, "num_tokens": 28494248.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 835 }, { "completion_length": 2812.0834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 6122.0, "completions/max_terminated_length": 6122.0, "completions/mean_length": 2812.08349609375, "completions/mean_terminated_length": 2812.08349609375, "completions/min_length": 758.0, "completions/min_terminated_length": 758.0, "epoch": 0.2835820895522388, "frac_reward_zero_std": 0.5, "grad_norm": 0.5875731706619263, "kl": 0.0, "learning_rate": 3.645617667356798e-07, "loss": 0.0046, "num_tokens": 28534869.0, "reward": 0.9541667699813843, "reward_std": 0.19900795817375183, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 836 }, { "completion_length": 1912.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5484.0, "completions/max_terminated_length": 5484.0, "completions/mean_length": 1912.25, "completions/mean_terminated_length": 1912.25, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "epoch": 0.283921302578019, "frac_reward_zero_std": 1.0, "grad_norm": 1.538487879315653e-07, "kl": 0.0, "learning_rate": 3.643892339544513e-07, "loss": 0.0, "num_tokens": 28567734.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 837 }, { "completion_length": 942.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1422.0, "completions/max_terminated_length": 1422.0, "completions/mean_length": 942.0, "completions/mean_terminated_length": 942.0, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "epoch": 0.28426051560379917, "frac_reward_zero_std": 0.5, "grad_norm": 0.08199096471071243, "kl": 0.0, "learning_rate": 3.642167011732229e-07, "loss": 0.0005, "num_tokens": 28595202.0, "reward": 0.7333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 838 }, { "completion_length": 2428.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4072.0, "completions/max_terminated_length": 4072.0, "completions/mean_length": 2428.916748046875, "completions/mean_terminated_length": 2428.916748046875, "completions/min_length": 1415.0, "completions/min_terminated_length": 1415.0, "epoch": 0.2845997286295794, "frac_reward_zero_std": 0.5, "grad_norm": 0.8435646891593933, "kl": 0.0, "learning_rate": 3.6404416839199446e-07, "loss": 0.0061, "num_tokens": 28631879.0, "reward": 0.8666667342185974, "reward_std": 0.2581988573074341, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.42497774958610535, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 839 }, { "completion_length": 1648.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5064.0, "completions/max_terminated_length": 5064.0, "completions/mean_length": 1648.666748046875, "completions/mean_terminated_length": 1648.666748046875, "completions/min_length": 652.0, "completions/min_terminated_length": 652.0, "epoch": 0.28493894165535955, "frac_reward_zero_std": 0.5, "grad_norm": 0.09471841156482697, "kl": 0.0, "learning_rate": 3.63871635610766e-07, "loss": -0.0025, "num_tokens": 28661653.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 840 }, { "completion_length": 1834.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2920.0, "completions/max_terminated_length": 2920.0, "completions/mean_length": 1834.8333740234375, "completions/mean_terminated_length": 1834.8333740234375, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 0.28527815468113976, "frac_reward_zero_std": 0.5, "grad_norm": 0.0629253163933754, "kl": 0.0, "learning_rate": 3.6369910282953757e-07, "loss": 0.0021, "num_tokens": 28697795.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 841 }, { "completion_length": 1036.1667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 1895.0, "completions/max_terminated_length": 1895.0, "completions/mean_length": 1036.166748046875, "completions/mean_terminated_length": 1036.166748046875, "completions/min_length": 578.0, "completions/min_terminated_length": 578.0, "epoch": 0.2856173677069199, "frac_reward_zero_std": 0.5, "grad_norm": 0.09502989798784256, "kl": 0.0, "learning_rate": 3.6352657004830917e-07, "loss": -0.0, "num_tokens": 28721341.0, "reward": 1.2333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 842 }, { "completion_length": 898.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1205.0, "completions/max_terminated_length": 1205.0, "completions/mean_length": 898.9166870117188, "completions/mean_terminated_length": 898.9166870117188, "completions/min_length": 597.0, "completions/min_terminated_length": 597.0, "epoch": 0.28595658073270014, "frac_reward_zero_std": 1.0, "grad_norm": 9.972598036256386e-08, "kl": 0.0, "learning_rate": 3.633540372670807e-07, "loss": 0.0, "num_tokens": 28743330.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 843 }, { "completion_length": 1803.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5706.0, "completions/max_terminated_length": 5706.0, "completions/mean_length": 1803.75, "completions/mean_terminated_length": 1803.75, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.2862957937584803, "frac_reward_zero_std": 0.5, "grad_norm": 0.4343622028827667, "kl": 0.0, "learning_rate": 3.631815044858523e-07, "loss": 0.0283, "num_tokens": 28779027.0, "reward": 0.38333332538604736, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.0833333358168602, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 844 }, { "completion_length": 1681.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3518.0, "completions/max_terminated_length": 3518.0, "completions/mean_length": 1681.75, "completions/mean_terminated_length": 1681.75, "completions/min_length": 950.0, "completions/min_terminated_length": 950.0, "epoch": 0.2866350067842605, "frac_reward_zero_std": 0.5, "grad_norm": 0.07720570266246796, "kl": 0.0, "learning_rate": 3.6300897170462383e-07, "loss": -0.0016, "num_tokens": 28814628.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 845 }, { "completion_length": 1228.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2419.0, "completions/max_terminated_length": 2419.0, "completions/mean_length": 1228.666748046875, "completions/mean_terminated_length": 1228.666748046875, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.28697421981004073, "frac_reward_zero_std": 1.0, "grad_norm": 1.1242604358585595e-07, "kl": 0.0, "learning_rate": 3.6283643892339544e-07, "loss": 0.0, "num_tokens": 28844198.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 846 }, { "completion_length": 2611.666748046875, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6019.0, "completions/mean_length": 3709.83349609375, "completions/mean_terminated_length": 3134.0, "completions/min_length": 1123.0, "completions/min_terminated_length": 1123.0, "epoch": 0.2873134328358209, "frac_reward_zero_std": 0.0, "grad_norm": 0.6555944085121155, "kl": NaN, "learning_rate": 3.6266390614216704e-07, "loss": -0.0615, "num_tokens": 28886206.0, "reward": 0.7500001192092896, "reward_std": 0.280963659286499, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.4472135901451111, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 847 }, { "completion_length": 774.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2040.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 774.75, "completions/mean_terminated_length": 774.75, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.2876526458616011, "frac_reward_zero_std": 1.0, "grad_norm": 1.7893466974783223e-07, "kl": 0.0, "learning_rate": 3.6249137336093854e-07, "loss": 0.0, "num_tokens": 28903849.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 848 }, { "completion_length": 2093.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4633.0, "completions/max_terminated_length": 4633.0, "completions/mean_length": 2093.25, "completions/mean_terminated_length": 2093.25, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 0.28799185888738127, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.6231884057971015e-07, "loss": 0.0, "num_tokens": 28938424.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 849 }, { "completion_length": 2106.7501220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4592.0, "completions/mean_length": 2655.83349609375, "completions/mean_terminated_length": 2298.272705078125, "completions/min_length": 1069.0, "completions/min_terminated_length": 1069.0, "epoch": 0.2883310719131615, "frac_reward_zero_std": 0.5, "grad_norm": 0.1451241672039032, "kl": NaN, "learning_rate": 3.621463077984817e-07, "loss": -0.0101, "num_tokens": 28975171.0, "reward": 0.6750000715255737, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 850 }, { "completion_length": 1222.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2122.0, "completions/max_terminated_length": 2122.0, "completions/mean_length": 1222.5833740234375, "completions/mean_terminated_length": 1222.5833740234375, "completions/min_length": 738.0, "completions/min_terminated_length": 738.0, "epoch": 0.28867028493894165, "frac_reward_zero_std": 1.0, "grad_norm": 1.7627006343445828e-07, "kl": 0.0, "learning_rate": 3.6197377501725325e-07, "loss": 0.0, "num_tokens": 29000672.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 851 }, { "completion_length": 1198.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3363.0, "completions/max_terminated_length": 3363.0, "completions/mean_length": 1198.0, "completions/mean_terminated_length": 1198.0, "completions/min_length": 603.0, "completions/min_terminated_length": 603.0, "epoch": 0.28900949796472186, "frac_reward_zero_std": 0.5, "grad_norm": 0.040780119597911835, "kl": 0.0, "learning_rate": 3.618012422360248e-07, "loss": -0.0002, "num_tokens": 29028500.0, "reward": 0.2875000238418579, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 852 }, { "completion_length": 3095.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5384.0, "completions/max_terminated_length": 5384.0, "completions/mean_length": 3095.416748046875, "completions/mean_terminated_length": 3095.416748046875, "completions/min_length": 977.0, "completions/min_terminated_length": 977.0, "epoch": 0.289348710990502, "frac_reward_zero_std": 0.5, "grad_norm": 0.6323304772377014, "kl": 0.0, "learning_rate": 3.616287094547964e-07, "loss": -0.0108, "num_tokens": 29076835.0, "reward": 0.7166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 853 }, { "completion_length": 938.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1590.0, "completions/max_terminated_length": 1590.0, "completions/mean_length": 938.5, "completions/mean_terminated_length": 938.5, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.28968792401628224, "frac_reward_zero_std": 1.0, "grad_norm": 9.98534375185045e-08, "kl": 0.0, "learning_rate": 3.6145617667356797e-07, "loss": 0.0, "num_tokens": 29098753.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 854 }, { "completion_length": 1426.1667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 3073.0, "completions/max_terminated_length": 3073.0, "completions/mean_length": 1426.166748046875, "completions/mean_terminated_length": 1426.166748046875, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 0.2900271370420624, "frac_reward_zero_std": 1.0, "grad_norm": 2.5459246444370365e-07, "kl": 0.0, "learning_rate": 3.612836438923395e-07, "loss": 0.0, "num_tokens": 29129841.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 855 }, { "completion_length": 833.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1799.0, "completions/max_terminated_length": 1799.0, "completions/mean_length": 833.8333740234375, "completions/mean_terminated_length": 833.8333740234375, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.2903663500678426, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.6111111111111107e-07, "loss": 0.0, "num_tokens": 29150641.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 856 }, { "completion_length": 1173.1667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 3143.0, "completions/max_terminated_length": 3143.0, "completions/mean_length": 1173.166748046875, "completions/mean_terminated_length": 1173.166748046875, "completions/min_length": 670.0, "completions/min_terminated_length": 670.0, "epoch": 0.2907055630936228, "frac_reward_zero_std": 0.5, "grad_norm": 0.09201031923294067, "kl": 0.0, "learning_rate": 3.609385783298827e-07, "loss": 0.0009, "num_tokens": 29175879.0, "reward": 1.25, "reward_std": 0.05477222427725792, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 857 }, { "completion_length": 1476.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2281.0, "completions/max_terminated_length": 2281.0, "completions/mean_length": 1476.416748046875, "completions/mean_terminated_length": 1476.416748046875, "completions/min_length": 635.0, "completions/min_terminated_length": 635.0, "epoch": 0.291044776119403, "frac_reward_zero_std": 1.0, "grad_norm": 1.1626907081563331e-07, "kl": 0.0, "learning_rate": 3.6076604554865423e-07, "loss": 0.0, "num_tokens": 29205764.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 858 }, { "completion_length": 2051.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3636.0, "completions/max_terminated_length": 3636.0, "completions/mean_length": 2051.33349609375, "completions/mean_terminated_length": 2051.33349609375, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 0.29138398914518315, "frac_reward_zero_std": 0.5, "grad_norm": 0.4645165503025055, "kl": 0.0, "learning_rate": 3.605935127674258e-07, "loss": -0.0135, "num_tokens": 29241348.0, "reward": 0.38333332538604736, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.0833333358168602, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 859 }, { "completion_length": 2124.3334350585938, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5150.0, "completions/mean_length": 2673.416748046875, "completions/mean_terminated_length": 2317.45458984375, "completions/min_length": 672.0, "completions/min_terminated_length": 672.0, "epoch": 0.29172320217096337, "frac_reward_zero_std": 0.5, "grad_norm": 0.17607992887496948, "kl": NaN, "learning_rate": 3.6042097998619734e-07, "loss": -0.0124, "num_tokens": 29280682.0, "reward": 0.7749999761581421, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 860 }, { "completion_length": 2468.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5737.0, "completions/max_terminated_length": 5737.0, "completions/mean_length": 2468.416748046875, "completions/mean_terminated_length": 2468.416748046875, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.29206241519674353, "frac_reward_zero_std": 0.5, "grad_norm": 0.25238487124443054, "kl": 0.0, "learning_rate": 3.6024844720496894e-07, "loss": 0.0017, "num_tokens": 29323107.0, "reward": 1.1166667938232422, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 861 }, { "completion_length": 1086.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2235.0, "completions/max_terminated_length": 2235.0, "completions/mean_length": 1086.0833740234375, "completions/mean_terminated_length": 1086.0833740234375, "completions/min_length": 657.0, "completions/min_terminated_length": 657.0, "epoch": 0.29240162822252375, "frac_reward_zero_std": 0.5, "grad_norm": 0.09536635130643845, "kl": 0.0, "learning_rate": 3.600759144237405e-07, "loss": -0.0001, "num_tokens": 29345398.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 862 }, { "completion_length": 2525.0833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4973.0, "completions/mean_length": 3074.166748046875, "completions/mean_terminated_length": 2754.636474609375, "completions/min_length": 1133.0, "completions/min_terminated_length": 1133.0, "epoch": 0.29274084124830396, "frac_reward_zero_std": 0.0, "grad_norm": 0.4000323414802551, "kl": NaN, "learning_rate": 3.5990338164251205e-07, "loss": -0.0039, "num_tokens": 29389211.0, "reward": 0.6416666507720947, "reward_std": 0.2474271059036255, "rewards/correctness_reward_func/mean": 0.36666664481163025, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 863 }, { "completion_length": 940.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1529.0, "completions/max_terminated_length": 1529.0, "completions/mean_length": 940.5833740234375, "completions/mean_terminated_length": 940.5833740234375, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.2930800542740841, "frac_reward_zero_std": 0.5, "grad_norm": 0.42567434906959534, "kl": 0.0, "learning_rate": 3.5973084886128365e-07, "loss": 0.0036, "num_tokens": 29410548.0, "reward": 1.058333396911621, "reward_std": 0.2239791601896286, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 864 }, { "completion_length": 1378.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2369.0, "completions/max_terminated_length": 2369.0, "completions/mean_length": 1378.25, "completions/mean_terminated_length": 1378.25, "completions/min_length": 692.0, "completions/min_terminated_length": 692.0, "epoch": 0.29341926729986434, "frac_reward_zero_std": 0.5, "grad_norm": 0.08405417203903198, "kl": 0.0, "learning_rate": 3.595583160800552e-07, "loss": -0.0007, "num_tokens": 29439381.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 865 }, { "completion_length": 1817.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3688.0, "completions/max_terminated_length": 3688.0, "completions/mean_length": 1817.5, "completions/mean_terminated_length": 1817.5, "completions/min_length": 645.0, "completions/min_terminated_length": 645.0, "epoch": 0.2937584803256445, "frac_reward_zero_std": 1.0, "grad_norm": 2.4541046172998904e-07, "kl": 0.0, "learning_rate": 3.5938578329882676e-07, "loss": 0.0, "num_tokens": 29472459.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 866 }, { "completion_length": 1045.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3683.0, "completions/max_terminated_length": 3683.0, "completions/mean_length": 1045.75, "completions/mean_terminated_length": 1045.75, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.2940976933514247, "frac_reward_zero_std": 0.5, "grad_norm": 0.0929461270570755, "kl": 0.0, "learning_rate": 3.592132505175983e-07, "loss": -0.0029, "num_tokens": 29493606.0, "reward": 0.7666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 867 }, { "completion_length": 1191.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2802.0, "completions/max_terminated_length": 2802.0, "completions/mean_length": 1191.0833740234375, "completions/mean_terminated_length": 1191.0833740234375, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.2944369063772049, "frac_reward_zero_std": 0.5, "grad_norm": 0.08606947213411331, "kl": 0.0, "learning_rate": 3.590407177363699e-07, "loss": -0.0002, "num_tokens": 29520937.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 868 }, { "completion_length": 2072.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4117.0, "completions/max_terminated_length": 4117.0, "completions/mean_length": 2072.5, "completions/mean_terminated_length": 2072.5, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 0.2947761194029851, "frac_reward_zero_std": 0.0, "grad_norm": 0.15073060989379883, "kl": 0.0, "learning_rate": 3.5886818495514147e-07, "loss": -0.0003, "num_tokens": 29560627.0, "reward": 1.2333333492279053, "reward_std": 0.0955970510840416, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 869 }, { "completion_length": 1233.9166870117188, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 2920.0, "completions/mean_length": 2332.08349609375, "completions/mean_terminated_length": 1480.7000732421875, "completions/min_length": 969.0, "completions/min_terminated_length": 969.0, "epoch": 0.29511533242876525, "frac_reward_zero_std": 0.5, "grad_norm": 0.3028711974620819, "kl": NaN, "learning_rate": 3.58695652173913e-07, "loss": -0.0307, "num_tokens": 29584182.0, "reward": 1.0166666507720947, "reward_std": 0.2840188145637512, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941801071167, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 870 }, { "completion_length": 1574.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 6154.0, "completions/max_terminated_length": 6154.0, "completions/mean_length": 1574.416748046875, "completions/mean_terminated_length": 1574.416748046875, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 0.29545454545454547, "frac_reward_zero_std": 0.0, "grad_norm": 0.661939263343811, "kl": 0.0, "learning_rate": 3.585231193926846e-07, "loss": 0.0737, "num_tokens": 29614547.0, "reward": 1.1166666746139526, "reward_std": 0.29902368783950806, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.3857303261756897, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 871 }, { "completion_length": 1510.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3076.0, "completions/max_terminated_length": 3076.0, "completions/mean_length": 1510.666748046875, "completions/mean_terminated_length": 1510.666748046875, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.29579375848032563, "frac_reward_zero_std": 0.5, "grad_norm": 0.5096197128295898, "kl": 0.0, "learning_rate": 3.583505866114562e-07, "loss": 0.0023, "num_tokens": 29647225.0, "reward": 1.2166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 872 }, { "completion_length": 3703.3333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6485.0, "completions/mean_length": 4252.4169921875, "completions/mean_terminated_length": 4040.0, "completions/min_length": 1202.0, "completions/min_terminated_length": 1202.0, "epoch": 0.29613297150610585, "frac_reward_zero_std": 0.5, "grad_norm": 0.2425873577594757, "kl": NaN, "learning_rate": 3.581780538302277e-07, "loss": -0.0457, "num_tokens": 29703071.0, "reward": 0.625, "reward_std": 0.23611438274383545, "rewards/correctness_reward_func/mean": 0.3499999940395355, "rewards/correctness_reward_func/std": 0.4358898997306824, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 873 }, { "completion_length": 1448.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2933.0, "completions/max_terminated_length": 2933.0, "completions/mean_length": 1448.666748046875, "completions/mean_terminated_length": 1448.666748046875, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.296472184531886, "frac_reward_zero_std": 0.5, "grad_norm": 0.0715579092502594, "kl": 0.0, "learning_rate": 3.580055210489993e-07, "loss": -0.0002, "num_tokens": 29735509.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 874 }, { "completion_length": 821.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1282.0, "completions/max_terminated_length": 1282.0, "completions/mean_length": 821.5, "completions/mean_terminated_length": 821.5, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 0.2968113975576662, "frac_reward_zero_std": 0.5, "grad_norm": 0.07250086963176727, "kl": 0.0, "learning_rate": 3.5783298826777084e-07, "loss": -0.0009, "num_tokens": 29759443.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 875 }, { "completion_length": 1254.8333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3132.0, "completions/mean_length": 1803.916748046875, "completions/mean_terminated_length": 1368.9091796875, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 0.2971506105834464, "frac_reward_zero_std": 0.5, "grad_norm": 0.47821852564811707, "kl": NaN, "learning_rate": 3.5766045548654245e-07, "loss": -0.0038, "num_tokens": 29784725.0, "reward": 0.6083333492279053, "reward_std": 0.3006936311721802, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 876 }, { "completion_length": 1243.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1934.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 1243.75, "completions/mean_terminated_length": 1243.75, "completions/min_length": 563.0, "completions/min_terminated_length": 563.0, "epoch": 0.2974898236092266, "frac_reward_zero_std": 0.0, "grad_norm": 0.5256324410438538, "kl": 0.0, "learning_rate": 3.5748792270531395e-07, "loss": -0.0066, "num_tokens": 29811872.0, "reward": 1.0916666984558105, "reward_std": 0.26536136865615845, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 877 }, { "completion_length": 1533.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5486.0, "completions/max_terminated_length": 5486.0, "completions/mean_length": 1533.0, "completions/mean_terminated_length": 1533.0, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 0.29782903663500676, "frac_reward_zero_std": 0.5, "grad_norm": 0.3232329785823822, "kl": 0.0, "learning_rate": 3.5731538992408555e-07, "loss": -0.0113, "num_tokens": 29842292.0, "reward": 0.6333333253860474, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 878 }, { "completion_length": 1707.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3730.0, "completions/max_terminated_length": 3730.0, "completions/mean_length": 1707.3333740234375, "completions/mean_terminated_length": 1707.3333740234375, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 0.298168249660787, "frac_reward_zero_std": 1.0, "grad_norm": 1.7474680191753578e-07, "kl": 0.0, "learning_rate": 3.5714285714285716e-07, "loss": 0.0, "num_tokens": 29872560.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 879 }, { "completion_length": 1907.8334350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 5257.0, "completions/max_terminated_length": 5257.0, "completions/mean_length": 1907.8333740234375, "completions/mean_terminated_length": 1907.8333740234375, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.29850746268656714, "frac_reward_zero_std": 1.0, "grad_norm": 3.42220346283284e-07, "kl": 0.0, "learning_rate": 3.569703243616287e-07, "loss": 0.0, "num_tokens": 29907760.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 880 }, { "completion_length": 1137.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1893.0, "completions/max_terminated_length": 1893.0, "completions/mean_length": 1137.8333740234375, "completions/mean_terminated_length": 1137.8333740234375, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 0.29884667571234735, "frac_reward_zero_std": 0.5, "grad_norm": 0.066347137093544, "kl": 0.0, "learning_rate": 3.5679779158040026e-07, "loss": 0.0011, "num_tokens": 29931818.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 881 }, { "completion_length": 1808.0834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 3643.0, "completions/max_terminated_length": 3643.0, "completions/mean_length": 1808.0833740234375, "completions/mean_terminated_length": 1808.0833740234375, "completions/min_length": 686.0, "completions/min_terminated_length": 686.0, "epoch": 0.29918588873812757, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.566252587991718e-07, "loss": 0.0, "num_tokens": 29967603.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 882 }, { "completion_length": 1442.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1852.0, "completions/max_terminated_length": 1852.0, "completions/mean_length": 1442.0833740234375, "completions/mean_terminated_length": 1442.0833740234375, "completions/min_length": 820.0, "completions/min_terminated_length": 820.0, "epoch": 0.29952510176390773, "frac_reward_zero_std": 0.0, "grad_norm": 0.5057232975959778, "kl": 0.0, "learning_rate": 3.564527260179434e-07, "loss": 0.0043, "num_tokens": 30000094.0, "reward": 1.1041667461395264, "reward_std": 0.27090632915496826, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 883 }, { "completion_length": 1418.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2682.0, "completions/max_terminated_length": 2682.0, "completions/mean_length": 1418.3333740234375, "completions/mean_terminated_length": 1418.3333740234375, "completions/min_length": 675.0, "completions/min_terminated_length": 675.0, "epoch": 0.29986431478968795, "frac_reward_zero_std": 1.0, "grad_norm": 2.0655014054682397e-07, "kl": 0.0, "learning_rate": 3.562801932367149e-07, "loss": 0.0, "num_tokens": 30025184.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 884 }, { "completion_length": 926.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1558.0, "completions/max_terminated_length": 1558.0, "completions/mean_length": 926.25, "completions/mean_terminated_length": 926.25, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.3002035278154681, "frac_reward_zero_std": 0.5, "grad_norm": 0.08455709367990494, "kl": 0.0, "learning_rate": 3.5610766045548653e-07, "loss": 0.0011, "num_tokens": 30046109.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 885 }, { "completion_length": 983.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1365.0, "completions/max_terminated_length": 1365.0, "completions/mean_length": 983.0, "completions/mean_terminated_length": 983.0, "completions/min_length": 604.0, "completions/min_terminated_length": 604.0, "epoch": 0.3005427408412483, "frac_reward_zero_std": 1.0, "grad_norm": 2.2232033813907037e-07, "kl": 0.0, "learning_rate": 3.559351276742581e-07, "loss": 0.0, "num_tokens": 30072803.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 886 }, { "completion_length": 1092.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1479.0, "completions/max_terminated_length": 1479.0, "completions/mean_length": 1092.666748046875, "completions/mean_terminated_length": 1092.666748046875, "completions/min_length": 793.0, "completions/min_terminated_length": 793.0, "epoch": 0.3008819538670285, "frac_reward_zero_std": 0.0, "grad_norm": 0.1025518849492073, "kl": 0.0, "learning_rate": 3.557625948930297e-07, "loss": -0.0005, "num_tokens": 30095977.0, "reward": 1.1666667461395264, "reward_std": 0.09559705853462219, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 887 }, { "completion_length": 1043.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1950.0, "completions/max_terminated_length": 1950.0, "completions/mean_length": 1043.3333740234375, "completions/mean_terminated_length": 1043.3333740234375, "completions/min_length": 683.0, "completions/min_terminated_length": 683.0, "epoch": 0.3012211668928087, "frac_reward_zero_std": 1.0, "grad_norm": 1.7373888283600536e-07, "kl": 0.0, "learning_rate": 3.555900621118012e-07, "loss": 0.0, "num_tokens": 30121943.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 888 }, { "completion_length": 1952.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3501.0, "completions/max_terminated_length": 3501.0, "completions/mean_length": 1952.25, "completions/mean_terminated_length": 1952.25, "completions/min_length": 539.0, "completions/min_terminated_length": 539.0, "epoch": 0.30156037991858886, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.554175293305728e-07, "loss": 0.0, "num_tokens": 30153266.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 889 }, { "completion_length": 1630.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3602.0, "completions/max_terminated_length": 3602.0, "completions/mean_length": 1630.666748046875, "completions/mean_terminated_length": 1630.666748046875, "completions/min_length": 514.0, "completions/min_terminated_length": 514.0, "epoch": 0.3018995929443691, "frac_reward_zero_std": 0.5, "grad_norm": 0.09205930680036545, "kl": 0.0, "learning_rate": 3.5524499654934434e-07, "loss": -0.0002, "num_tokens": 30189412.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 890 }, { "completion_length": 1857.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3834.0, "completions/max_terminated_length": 3834.0, "completions/mean_length": 1857.916748046875, "completions/mean_terminated_length": 1857.916748046875, "completions/min_length": 929.0, "completions/min_terminated_length": 929.0, "epoch": 0.30223880597014924, "frac_reward_zero_std": 1.0, "grad_norm": 1.2736225585285865e-07, "kl": 0.0, "learning_rate": 3.5507246376811595e-07, "loss": 0.0, "num_tokens": 30221493.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 891 }, { "completion_length": 1551.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2618.0, "completions/max_terminated_length": 2618.0, "completions/mean_length": 1551.75, "completions/mean_terminated_length": 1551.75, "completions/min_length": 926.0, "completions/min_terminated_length": 926.0, "epoch": 0.30257801899592945, "frac_reward_zero_std": 0.5, "grad_norm": 0.07093626260757446, "kl": 0.0, "learning_rate": 3.5489993098688745e-07, "loss": 0.001, "num_tokens": 30250272.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 892 }, { "completion_length": 2763.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5376.0, "completions/max_terminated_length": 5376.0, "completions/mean_length": 2763.58349609375, "completions/mean_terminated_length": 2763.58349609375, "completions/min_length": 1006.0, "completions/min_terminated_length": 1006.0, "epoch": 0.3029172320217096, "frac_reward_zero_std": 0.0, "grad_norm": 0.8445454835891724, "kl": 0.0, "learning_rate": 3.5472739820565906e-07, "loss": 0.0196, "num_tokens": 30297445.0, "reward": 1.0333333015441895, "reward_std": 0.4581989049911499, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.44585633277893066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 893 }, { "completion_length": 744.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1564.0, "completions/max_terminated_length": 1564.0, "completions/mean_length": 744.4166870117188, "completions/mean_terminated_length": 744.4166870117188, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.30325644504748983, "frac_reward_zero_std": 0.5, "grad_norm": 0.08562467247247696, "kl": 0.0, "learning_rate": 3.5455486542443066e-07, "loss": 0.0003, "num_tokens": 30324564.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 894 }, { "completion_length": 1995.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4017.0, "completions/max_terminated_length": 4017.0, "completions/mean_length": 1995.916748046875, "completions/mean_terminated_length": 1995.916748046875, "completions/min_length": 585.0, "completions/min_terminated_length": 585.0, "epoch": 0.30359565807327, "frac_reward_zero_std": 0.5, "grad_norm": 0.11056003719568253, "kl": 0.0, "learning_rate": 3.543823326432022e-07, "loss": 0.0017, "num_tokens": 30357587.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.45000001788139343, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 895 }, { "completion_length": 1346.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 6012.0, "completions/max_terminated_length": 6012.0, "completions/mean_length": 1346.416748046875, "completions/mean_terminated_length": 1346.416748046875, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.3039348710990502, "frac_reward_zero_std": 0.5, "grad_norm": 0.997124433517456, "kl": 0.0, "learning_rate": 3.5420979986197377e-07, "loss": 0.0458, "num_tokens": 30385786.0, "reward": 1.2000000476837158, "reward_std": 0.20000001788139343, "rewards/correctness_reward_func/mean": 0.9000000357627869, "rewards/correctness_reward_func/std": 0.28919950127601624, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 896 }, { "completion_length": 1527.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5303.0, "completions/max_terminated_length": 5303.0, "completions/mean_length": 1527.75, "completions/mean_terminated_length": 1527.75, "completions/min_length": 634.0, "completions/min_terminated_length": 634.0, "epoch": 0.30427408412483037, "frac_reward_zero_std": 0.5, "grad_norm": 0.4941687285900116, "kl": 0.0, "learning_rate": 3.540372670807453e-07, "loss": 0.0035, "num_tokens": 30416203.0, "reward": 0.7166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 897 }, { "completion_length": 1980.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6500.0, "completions/max_terminated_length": 6500.0, "completions/mean_length": 1980.0, "completions/mean_terminated_length": 1980.0, "completions/min_length": 618.0, "completions/min_terminated_length": 618.0, "epoch": 0.3046132971506106, "frac_reward_zero_std": 0.5, "grad_norm": 0.8511844873428345, "kl": 0.0, "learning_rate": 3.538647342995169e-07, "loss": 0.0584, "num_tokens": 30449485.0, "reward": 0.7166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 898 }, { "completion_length": 921.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1993.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 921.75, "completions/mean_terminated_length": 921.75, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.3049525101763908, "frac_reward_zero_std": 1.0, "grad_norm": 2.2145842137888394e-07, "kl": 0.0, "learning_rate": 3.536922015182884e-07, "loss": 0.0, "num_tokens": 30472606.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 899 }, { "completion_length": 2026.166748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5969.0, "completions/mean_length": 2575.25, "completions/mean_terminated_length": 2210.36376953125, "completions/min_length": 546.0, "completions/min_terminated_length": 546.0, "epoch": 0.30529172320217096, "frac_reward_zero_std": 0.5, "grad_norm": 0.5869353413581848, "kl": NaN, "learning_rate": 3.5351966873706003e-07, "loss": -0.0101, "num_tokens": 30508272.0, "reward": 0.9916666746139526, "reward_std": 0.2727941870689392, "rewards/correctness_reward_func/mean": 0.7166666984558105, "rewards/correctness_reward_func/std": 0.4386618733406067, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 900 }, { "completion_length": 2621.33349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 4830.0, "completions/max_terminated_length": 4830.0, "completions/mean_length": 2621.33349609375, "completions/mean_terminated_length": 2621.33349609375, "completions/min_length": 616.0, "completions/min_terminated_length": 616.0, "epoch": 0.3056309362279512, "frac_reward_zero_std": 1.0, "grad_norm": 2.387363906564133e-07, "kl": 0.0, "learning_rate": 3.533471359558316e-07, "loss": 0.0, "num_tokens": 30554278.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 901 }, { "completion_length": 1101.6666870117188, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5642.0, "completions/mean_length": 3298.0, "completions/mean_terminated_length": 1652.5, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.30597014925373134, "frac_reward_zero_std": 0.0, "grad_norm": 0.21892796456813812, "kl": NaN, "learning_rate": 3.531746031746032e-07, "loss": -0.0191, "num_tokens": 30574698.0, "reward": 0.7083333730697632, "reward_std": 0.10790684819221497, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.11965861916542053, "step": 902 }, { "completion_length": 1977.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3584.0, "completions/mean_length": 1977.3333740234375, "completions/mean_terminated_length": 1977.3333740234375, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "epoch": 0.30630936227951155, "frac_reward_zero_std": 0.0, "grad_norm": 0.15072372555732727, "kl": 0.0, "learning_rate": 3.530020703933747e-07, "loss": -0.0001, "num_tokens": 30611248.0, "reward": 1.2000000476837158, "reward_std": 0.10327951610088348, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 903 }, { "completion_length": 1630.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 3331.0, "completions/max_terminated_length": 3331.0, "completions/mean_length": 1630.75, "completions/mean_terminated_length": 1630.75, "completions/min_length": 589.0, "completions/min_terminated_length": 589.0, "epoch": 0.3066485753052917, "frac_reward_zero_std": 0.5, "grad_norm": 0.10795415192842484, "kl": 0.0, "learning_rate": 3.528295376121463e-07, "loss": 0.0029, "num_tokens": 30643285.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 904 }, { "completion_length": 1505.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2986.0, "completions/max_terminated_length": 2986.0, "completions/mean_length": 1505.25, "completions/mean_terminated_length": 1505.25, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 0.30698778833107193, "frac_reward_zero_std": 0.5, "grad_norm": 0.32887357473373413, "kl": 0.0, "learning_rate": 3.5265700483091785e-07, "loss": 0.0141, "num_tokens": 30672184.0, "reward": 0.7666667699813843, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.46666669845581055, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 905 }, { "completion_length": 1593.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3868.0, "completions/max_terminated_length": 3868.0, "completions/mean_length": 1593.0833740234375, "completions/mean_terminated_length": 1593.0833740234375, "completions/min_length": 582.0, "completions/min_terminated_length": 582.0, "epoch": 0.3073270013568521, "frac_reward_zero_std": 0.5, "grad_norm": 0.7148328423500061, "kl": 0.0, "learning_rate": 3.5248447204968945e-07, "loss": -0.0257, "num_tokens": 30698309.0, "reward": 0.7833334803581238, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4833333194255829, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 906 }, { "completion_length": 1990.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4104.0, "completions/max_terminated_length": 4104.0, "completions/mean_length": 1990.8333740234375, "completions/mean_terminated_length": 1990.8333740234375, "completions/min_length": 985.0, "completions/min_terminated_length": 985.0, "epoch": 0.3076662143826323, "frac_reward_zero_std": 0.5, "grad_norm": 0.0934876799583435, "kl": 0.0, "learning_rate": 3.5231193926846095e-07, "loss": -0.0055, "num_tokens": 30735015.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 907 }, { "completion_length": 1272.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3085.0, "completions/max_terminated_length": 3085.0, "completions/mean_length": 1272.166748046875, "completions/mean_terminated_length": 1272.166748046875, "completions/min_length": 672.0, "completions/min_terminated_length": 672.0, "epoch": 0.30800542740841247, "frac_reward_zero_std": 0.5, "grad_norm": 0.0728154107928276, "kl": 0.0, "learning_rate": 3.5213940648723256e-07, "loss": -0.0013, "num_tokens": 30765851.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 908 }, { "completion_length": 1949.5834350585938, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5665.0, "completions/mean_length": 2498.666748046875, "completions/mean_terminated_length": 2126.818359375, "completions/min_length": 812.0, "completions/min_terminated_length": 812.0, "epoch": 0.3083446404341927, "frac_reward_zero_std": 0.5, "grad_norm": 0.2722416818141937, "kl": NaN, "learning_rate": 3.5196687370600417e-07, "loss": -0.0216, "num_tokens": 30799218.0, "reward": 1.008333444595337, "reward_std": 0.22453658282756805, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 909 }, { "completion_length": 2033.3333740234375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6482.0, "completions/mean_length": 3131.5, "completions/mean_terminated_length": 2440.0, "completions/min_length": 1208.0, "completions/min_terminated_length": 1208.0, "epoch": 0.30868385345997285, "frac_reward_zero_std": 0.5, "grad_norm": 0.22964391112327576, "kl": NaN, "learning_rate": 3.5179434092477567e-07, "loss": -0.0203, "num_tokens": 30830872.0, "reward": 0.25, "reward_std": 0.0774596780538559, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 910 }, { "completion_length": 1930.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3964.0, "completions/max_terminated_length": 3964.0, "completions/mean_length": 1930.0833740234375, "completions/mean_terminated_length": 1930.0833740234375, "completions/min_length": 728.0, "completions/min_terminated_length": 728.0, "epoch": 0.30902306648575306, "frac_reward_zero_std": 0.5, "grad_norm": 0.0788058415055275, "kl": 0.0, "learning_rate": 3.5162180814354727e-07, "loss": 0.0019, "num_tokens": 30866993.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 911 }, { "completion_length": 1843.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5078.0, "completions/max_terminated_length": 5078.0, "completions/mean_length": 1843.0, "completions/mean_terminated_length": 1843.0, "completions/min_length": 720.0, "completions/min_terminated_length": 720.0, "epoch": 0.3093622795115332, "frac_reward_zero_std": 0.0, "grad_norm": 0.17806807160377502, "kl": 0.0, "learning_rate": 3.514492753623188e-07, "loss": -0.0043, "num_tokens": 30903971.0, "reward": 1.2166666984558105, "reward_std": 0.10641197860240936, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 912 }, { "completion_length": 1276.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1647.0, "completions/max_terminated_length": 1647.0, "completions/mean_length": 1276.75, "completions/mean_terminated_length": 1276.75, "completions/min_length": 797.0, "completions/min_terminated_length": 797.0, "epoch": 0.30970149253731344, "frac_reward_zero_std": 1.0, "grad_norm": 1.0859728405421265e-07, "kl": 0.0, "learning_rate": 3.5127674258109043e-07, "loss": 0.0, "num_tokens": 30936494.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 913 }, { "completion_length": 1329.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2402.0, "completions/max_terminated_length": 2402.0, "completions/mean_length": 1329.75, "completions/mean_terminated_length": 1329.75, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "epoch": 0.3100407055630936, "frac_reward_zero_std": 1.0, "grad_norm": 2.0788388610526454e-07, "kl": 0.0, "learning_rate": 3.5110420979986193e-07, "loss": 0.0, "num_tokens": 30962819.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 914 }, { "completion_length": 941.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1229.0, "completions/max_terminated_length": 1229.0, "completions/mean_length": 941.0, "completions/mean_terminated_length": 941.0, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.3103799185888738, "frac_reward_zero_std": 1.0, "grad_norm": 9.0740662983535e-08, "kl": 0.0, "learning_rate": 3.5093167701863354e-07, "loss": 0.0, "num_tokens": 30982991.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 915 }, { "completion_length": 1847.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2928.0, "completions/max_terminated_length": 2928.0, "completions/mean_length": 1847.5, "completions/mean_terminated_length": 1847.5, "completions/min_length": 1272.0, "completions/min_terminated_length": 1272.0, "epoch": 0.310719131614654, "frac_reward_zero_std": 0.0, "grad_norm": 0.5373735427856445, "kl": 0.0, "learning_rate": 3.507591442374051e-07, "loss": 0.0057, "num_tokens": 31018577.0, "reward": 0.44999998807907104, "reward_std": 0.36742347478866577, "rewards/correctness_reward_func/mean": 0.14999999105930328, "rewards/correctness_reward_func/std": 0.35290998220443726, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 916 }, { "completion_length": 1876.0000610351562, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5651.0, "completions/mean_length": 2974.166748046875, "completions/mean_terminated_length": 2251.199951171875, "completions/min_length": 555.0, "completions/min_terminated_length": 555.0, "epoch": 0.3110583446404342, "frac_reward_zero_std": 0.0, "grad_norm": 0.8309308290481567, "kl": NaN, "learning_rate": 3.505866114561767e-07, "loss": -0.0607, "num_tokens": 31055465.0, "reward": 0.5833333730697632, "reward_std": 0.4858439564704895, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 917 }, { "completion_length": 1174.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2559.0, "completions/max_terminated_length": 2559.0, "completions/mean_length": 1174.916748046875, "completions/mean_terminated_length": 1174.916748046875, "completions/min_length": 688.0, "completions/min_terminated_length": 688.0, "epoch": 0.3113975576662144, "frac_reward_zero_std": 0.5, "grad_norm": 0.10468626022338867, "kl": 0.0, "learning_rate": 3.504140786749482e-07, "loss": -0.0008, "num_tokens": 31078678.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 918 }, { "completion_length": 1952.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 5511.0, "completions/max_terminated_length": 5511.0, "completions/mean_length": 1952.416748046875, "completions/mean_terminated_length": 1952.416748046875, "completions/min_length": 861.0, "completions/min_terminated_length": 861.0, "epoch": 0.31173677069199457, "frac_reward_zero_std": 0.5, "grad_norm": 0.5677090287208557, "kl": 0.0, "learning_rate": 3.502415458937198e-07, "loss": -0.0326, "num_tokens": 31110243.0, "reward": 0.5166666507720947, "reward_std": 0.24013885855674744, "rewards/correctness_reward_func/mean": 0.21666665375232697, "rewards/correctness_reward_func/std": 0.39504507184028625, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 919 }, { "completion_length": 813.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1229.0, "completions/max_terminated_length": 1229.0, "completions/mean_length": 813.5, "completions/mean_terminated_length": 813.5, "completions/min_length": 620.0, "completions/min_terminated_length": 620.0, "epoch": 0.3120759837177748, "frac_reward_zero_std": 0.5, "grad_norm": 0.06546156108379364, "kl": 0.0, "learning_rate": 3.5006901311249135e-07, "loss": 0.001, "num_tokens": 31134165.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 920 }, { "completion_length": 829.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2224.0, "completions/max_terminated_length": 2224.0, "completions/mean_length": 829.5, "completions/mean_terminated_length": 829.5, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.31241519674355495, "frac_reward_zero_std": 0.5, "grad_norm": 0.0700530856847763, "kl": 0.0, "learning_rate": 3.498964803312629e-07, "loss": -0.003, "num_tokens": 31155189.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 921 }, { "completion_length": 1294.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2378.0, "completions/max_terminated_length": 2378.0, "completions/mean_length": 1294.166748046875, "completions/mean_terminated_length": 1294.166748046875, "completions/min_length": 536.0, "completions/min_terminated_length": 536.0, "epoch": 0.31275440976933516, "frac_reward_zero_std": 0.0, "grad_norm": 0.1272267997264862, "kl": 0.0, "learning_rate": 3.4972394755003446e-07, "loss": -0.002, "num_tokens": 31176545.0, "reward": 1.2000000476837158, "reward_std": 0.08164963126182556, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 922 }, { "completion_length": 1220.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1935.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 1220.916748046875, "completions/mean_terminated_length": 1220.916748046875, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "epoch": 0.3130936227951153, "frac_reward_zero_std": 0.5, "grad_norm": 0.055474903434515, "kl": 0.0, "learning_rate": 3.4955141476880606e-07, "loss": 0.0003, "num_tokens": 31205242.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 923 }, { "completion_length": 1090.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2956.0, "completions/max_terminated_length": 2956.0, "completions/mean_length": 1090.0833740234375, "completions/mean_terminated_length": 1090.0833740234375, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.31343283582089554, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.4937888198757767e-07, "loss": 0.0, "num_tokens": 31232585.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 924 }, { "completion_length": 1138.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4259.0, "completions/max_terminated_length": 4259.0, "completions/mean_length": 1138.3333740234375, "completions/mean_terminated_length": 1138.3333740234375, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.3137720488466757, "frac_reward_zero_std": 0.5, "grad_norm": 0.37474891543388367, "kl": 0.0, "learning_rate": 3.4920634920634917e-07, "loss": -0.0011, "num_tokens": 31258053.0, "reward": 1.1166667938232422, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 925 }, { "completion_length": 977.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2604.0, "completions/max_terminated_length": 2604.0, "completions/mean_length": 977.25, "completions/mean_terminated_length": 977.25, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.3141112618724559, "frac_reward_zero_std": 1.0, "grad_norm": 9.163004222045856e-08, "kl": 0.0, "learning_rate": 3.490338164251208e-07, "loss": 0.0, "num_tokens": 31285440.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 926 }, { "completion_length": 1147.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1901.0, "completions/max_terminated_length": 1901.0, "completions/mean_length": 1147.0, "completions/mean_terminated_length": 1147.0, "completions/min_length": 658.0, "completions/min_terminated_length": 658.0, "epoch": 0.3144504748982361, "frac_reward_zero_std": 1.0, "grad_norm": 1.1602305249880374e-07, "kl": 0.0, "learning_rate": 3.4886128364389233e-07, "loss": 0.0, "num_tokens": 31308624.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 927 }, { "completion_length": 745.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1572.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 745.0833740234375, "completions/mean_terminated_length": 745.0833740234375, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.3147896879240163, "frac_reward_zero_std": 1.0, "grad_norm": 1.0053457799585885e-07, "kl": 0.0, "learning_rate": 3.4868875086266393e-07, "loss": 0.0, "num_tokens": 31327321.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 928 }, { "completion_length": 883.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1468.0, "completions/max_terminated_length": 1468.0, "completions/mean_length": 883.0, "completions/mean_terminated_length": 883.0, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 0.31512890094979645, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.4851621808143543e-07, "loss": 0.0, "num_tokens": 31351351.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 929 }, { "completion_length": 1326.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3650.0, "completions/max_terminated_length": 3650.0, "completions/mean_length": 1326.5833740234375, "completions/mean_terminated_length": 1326.5833740234375, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 0.31546811397557667, "frac_reward_zero_std": 0.5, "grad_norm": 0.2548946440219879, "kl": 0.0, "learning_rate": 3.4834368530020704e-07, "loss": 0.0018, "num_tokens": 31381022.0, "reward": 1.0500000715255737, "reward_std": 0.17606817185878754, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.24308621883392334, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 930 }, { "completion_length": 4050.166748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6140.0, "completions/mean_length": 4599.25, "completions/mean_terminated_length": 4418.36376953125, "completions/min_length": 1481.0, "completions/min_terminated_length": 1481.0, "epoch": 0.31580732700135683, "frac_reward_zero_std": 0.5, "grad_norm": 0.3112356662750244, "kl": NaN, "learning_rate": 3.481711525189786e-07, "loss": -0.0497, "num_tokens": 31438420.0, "reward": 0.6791666746139526, "reward_std": 0.2609677314758301, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 931 }, { "completion_length": 1440.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2638.0, "completions/max_terminated_length": 2638.0, "completions/mean_length": 1440.416748046875, "completions/mean_terminated_length": 1440.416748046875, "completions/min_length": 912.0, "completions/min_terminated_length": 912.0, "epoch": 0.31614654002713705, "frac_reward_zero_std": 0.5, "grad_norm": 0.07152718305587769, "kl": 0.0, "learning_rate": 3.4799861973775015e-07, "loss": -0.0004, "num_tokens": 31466607.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 932 }, { "completion_length": 913.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1962.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 913.1666870117188, "completions/mean_terminated_length": 913.1666870117188, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.3164857530529172, "frac_reward_zero_std": 0.5, "grad_norm": 0.03750376030802727, "kl": 0.0, "learning_rate": 3.478260869565217e-07, "loss": 0.0, "num_tokens": 31491683.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 933 }, { "completion_length": 615.5833587646484, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 1752.0, "completions/mean_length": 3910.08349609375, "completions/mean_terminated_length": 1231.166748046875, "completions/min_length": 780.0, "completions/min_terminated_length": 780.0, "epoch": 0.3168249660786974, "frac_reward_zero_std": 0.5, "grad_norm": 0.052773211151361465, "kl": NaN, "learning_rate": 3.476535541752933e-07, "loss": -0.0011, "num_tokens": 31510014.0, "reward": 0.6333333253860474, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 934 }, { "completion_length": 1668.75, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5946.0, "completions/mean_length": 3316.0, "completions/mean_terminated_length": 2225.0, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.31716417910447764, "frac_reward_zero_std": 0.0, "grad_norm": 0.8342804312705994, "kl": NaN, "learning_rate": 3.4748102139406486e-07, "loss": -0.1025, "num_tokens": 31543809.0, "reward": 0.845833420753479, "reward_std": 0.5563273429870605, "rewards/correctness_reward_func/mean": 0.6333333253860474, "rewards/correctness_reward_func/std": 0.4735424220561981, "rewards/format_reward_func/mean": 0.21250002086162567, "rewards/format_reward_func/std": 0.13505050539970398, "step": 935 }, { "completion_length": 444.75, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 444.75, "completions/mean_terminated_length": 444.75, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.3175033921302578, "frac_reward_zero_std": 0.5, "grad_norm": 0.06688177585601807, "kl": 0.0, "learning_rate": 3.473084886128364e-07, "loss": -0.0, "num_tokens": 31563336.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 936 }, { "completion_length": 1351.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2880.0, "completions/max_terminated_length": 2880.0, "completions/mean_length": 1351.416748046875, "completions/mean_terminated_length": 1351.416748046875, "completions/min_length": 847.0, "completions/min_terminated_length": 847.0, "epoch": 0.317842605156038, "frac_reward_zero_std": 0.5, "grad_norm": 0.477865070104599, "kl": 0.0, "learning_rate": 3.4713595583160796e-07, "loss": 0.0164, "num_tokens": 31591571.0, "reward": 1.2166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 937 }, { "completion_length": 2661.166748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5988.0, "completions/mean_length": 4308.4169921875, "completions/mean_terminated_length": 3548.22216796875, "completions/min_length": 1966.0, "completions/min_terminated_length": 1966.0, "epoch": 0.3181818181818182, "frac_reward_zero_std": 0.5, "grad_norm": 1.4337899684906006, "kl": NaN, "learning_rate": 3.4696342305037957e-07, "loss": -0.0686, "num_tokens": 31636261.0, "reward": 0.7083333730697632, "reward_std": 0.2518266439437866, "rewards/correctness_reward_func/mean": 0.4833333194255829, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 938 }, { "completion_length": 688.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1200.0, "completions/max_terminated_length": 1200.0, "completions/mean_length": 688.75, "completions/mean_terminated_length": 688.75, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.3185210312075984, "frac_reward_zero_std": 0.0, "grad_norm": 0.09581376612186432, "kl": 0.0, "learning_rate": 3.467908902691512e-07, "loss": -0.0006, "num_tokens": 31659796.0, "reward": 1.2166666984558105, "reward_std": 0.09246456623077393, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 939 }, { "completion_length": 1968.166748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5840.0, "completions/mean_length": 3615.416748046875, "completions/mean_terminated_length": 2624.22216796875, "completions/min_length": 1067.0, "completions/min_terminated_length": 1067.0, "epoch": 0.31886024423337855, "frac_reward_zero_std": 0.0, "grad_norm": 0.884489893913269, "kl": NaN, "learning_rate": 3.466183574879227e-07, "loss": -0.1014, "num_tokens": 31698522.0, "reward": 0.5916666984558105, "reward_std": 0.5406736135482788, "rewards/correctness_reward_func/mean": 0.36666667461395264, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.2250000238418579, "rewards/format_reward_func/std": 0.13568010926246643, "step": 940 }, { "completion_length": 1261.1666870117188, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 2849.0, "completions/mean_length": 1810.25, "completions/mean_terminated_length": 1375.8182373046875, "completions/min_length": 785.0, "completions/min_terminated_length": 785.0, "epoch": 0.31919945725915877, "frac_reward_zero_std": 0.5, "grad_norm": 0.24075216054916382, "kl": NaN, "learning_rate": 3.464458247066943e-07, "loss": -0.0164, "num_tokens": 31727606.0, "reward": 1.1083333492279053, "reward_std": 0.22453658282756805, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 941 }, { "completion_length": 985.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1892.0, "completions/max_terminated_length": 1892.0, "completions/mean_length": 985.25, "completions/mean_terminated_length": 985.25, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 0.31953867028493893, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.4627329192546583e-07, "loss": 0.0, "num_tokens": 31750769.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 942 }, { "completion_length": 887.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1810.0, "completions/max_terminated_length": 1810.0, "completions/mean_length": 887.9166870117188, "completions/mean_terminated_length": 887.9166870117188, "completions/min_length": 538.0, "completions/min_terminated_length": 538.0, "epoch": 0.31987788331071915, "frac_reward_zero_std": 1.0, "grad_norm": 2.1579033671059733e-07, "kl": 0.0, "learning_rate": 3.461007591442374e-07, "loss": 0.0, "num_tokens": 31775662.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 943 }, { "completion_length": 1304.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3547.0, "completions/max_terminated_length": 3547.0, "completions/mean_length": 1304.5, "completions/mean_terminated_length": 1304.5, "completions/min_length": 528.0, "completions/min_terminated_length": 528.0, "epoch": 0.3202170963364993, "frac_reward_zero_std": 1.0, "grad_norm": 2.3239888946591236e-07, "kl": 0.0, "learning_rate": 3.4592822636300894e-07, "loss": 0.0, "num_tokens": 31795600.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 944 }, { "completion_length": 697.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1262.0, "completions/max_terminated_length": 1262.0, "completions/mean_length": 697.75, "completions/mean_terminated_length": 697.75, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.3205563093622795, "frac_reward_zero_std": 0.5, "grad_norm": 0.2733204960823059, "kl": 0.0, "learning_rate": 3.4575569358178054e-07, "loss": -0.0005, "num_tokens": 31818679.0, "reward": 1.2000000476837158, "reward_std": 0.19999998807907104, "rewards/correctness_reward_func/mean": 0.9000000357627869, "rewards/correctness_reward_func/std": 0.28919950127601624, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 945 }, { "completion_length": 2257.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6290.0, "completions/max_terminated_length": 6290.0, "completions/mean_length": 2257.416748046875, "completions/mean_terminated_length": 2257.416748046875, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.3208955223880597, "frac_reward_zero_std": 0.0, "grad_norm": 0.6494872570037842, "kl": 0.0, "learning_rate": 3.455831608005521e-07, "loss": -0.0207, "num_tokens": 31862034.0, "reward": 0.7833333015441895, "reward_std": 0.36742347478866577, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 946 }, { "completion_length": 1796.0000915527344, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4971.0, "completions/mean_length": 2345.08349609375, "completions/mean_terminated_length": 1959.2728271484375, "completions/min_length": 595.0, "completions/min_terminated_length": 595.0, "epoch": 0.3212347354138399, "frac_reward_zero_std": 0.0, "grad_norm": 0.12197453528642654, "kl": NaN, "learning_rate": 3.4541062801932365e-07, "loss": -0.0085, "num_tokens": 31895976.0, "reward": 0.7458333373069763, "reward_std": 0.10710843652486801, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 947 }, { "completion_length": 608.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1151.0, "completions/max_terminated_length": 1151.0, "completions/mean_length": 608.1666870117188, "completions/mean_terminated_length": 608.1666870117188, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.32157394843962006, "frac_reward_zero_std": 1.0, "grad_norm": 1.0008302098185595e-07, "kl": 0.0, "learning_rate": 3.452380952380952e-07, "loss": 0.0, "num_tokens": 31916744.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 948 }, { "completion_length": 878.2500610351562, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4383.0, "completions/mean_length": 3623.666748046875, "completions/mean_terminated_length": 1505.571533203125, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "epoch": 0.3219131614654003, "frac_reward_zero_std": 0.5, "grad_norm": 0.11442892253398895, "kl": NaN, "learning_rate": 3.450655624568668e-07, "loss": -0.0083, "num_tokens": 31940219.0, "reward": 0.7250000238418579, "reward_std": 0.06708204001188278, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2250000238418579, "rewards/format_reward_func/std": 0.11965861171483994, "step": 949 }, { "completion_length": 1511.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3411.0, "completions/max_terminated_length": 3411.0, "completions/mean_length": 1511.666748046875, "completions/mean_terminated_length": 1511.666748046875, "completions/min_length": 705.0, "completions/min_terminated_length": 705.0, "epoch": 0.32225237449118044, "frac_reward_zero_std": 1.0, "grad_norm": 1.4038799633908638e-07, "kl": 0.0, "learning_rate": 3.4489302967563836e-07, "loss": 0.0, "num_tokens": 31968571.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 950 }, { "completion_length": 1021.1667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2335.0, "completions/max_terminated_length": 2335.0, "completions/mean_length": 1021.1666870117188, "completions/mean_terminated_length": 1021.1666870117188, "completions/min_length": 677.0, "completions/min_terminated_length": 677.0, "epoch": 0.32259158751696065, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.447204968944099e-07, "loss": 0.0, "num_tokens": 31991547.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 951 }, { "completion_length": 739.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 739.6666870117188, "completions/mean_terminated_length": 739.6666870117188, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 0.3229308005427408, "frac_reward_zero_std": 1.0, "grad_norm": 1.2268769467027596e-07, "kl": 0.0, "learning_rate": 3.4454796411318147e-07, "loss": 0.0, "num_tokens": 32012159.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 952 }, { "completion_length": 1022.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2877.0, "completions/max_terminated_length": 2877.0, "completions/mean_length": 1022.5, "completions/mean_terminated_length": 1022.5, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.32327001356852103, "frac_reward_zero_std": 0.5, "grad_norm": 0.05467428267002106, "kl": 0.0, "learning_rate": 3.4437543133195307e-07, "loss": -0.0002, "num_tokens": 32035457.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 953 }, { "completion_length": 2420.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 5646.0, "completions/max_terminated_length": 5646.0, "completions/mean_length": 2420.416748046875, "completions/mean_terminated_length": 2420.416748046875, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.32360922659430125, "frac_reward_zero_std": 0.5, "grad_norm": 0.4149426221847534, "kl": 0.0, "learning_rate": 3.4420289855072457e-07, "loss": -0.0127, "num_tokens": 32074594.0, "reward": 0.7541667819023132, "reward_std": 0.13268069922924042, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 954 }, { "completion_length": 832.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1620.0, "completions/max_terminated_length": 1620.0, "completions/mean_length": 832.0833740234375, "completions/mean_terminated_length": 832.0833740234375, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 0.3239484396200814, "frac_reward_zero_std": 0.5, "grad_norm": 0.08572785556316376, "kl": 0.0, "learning_rate": 3.440303657694962e-07, "loss": 0.0007, "num_tokens": 32092451.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 955 }, { "completion_length": 1114.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1610.0, "completions/max_terminated_length": 1610.0, "completions/mean_length": 1114.416748046875, "completions/mean_terminated_length": 1114.416748046875, "completions/min_length": 749.0, "completions/min_terminated_length": 749.0, "epoch": 0.3242876526458616, "frac_reward_zero_std": 1.0, "grad_norm": 2.0851454962667049e-07, "kl": 0.0, "learning_rate": 3.438578329882678e-07, "loss": 0.0, "num_tokens": 32116522.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 956 }, { "completion_length": 2551.08349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 3836.0, "completions/max_terminated_length": 3836.0, "completions/mean_length": 2551.08349609375, "completions/mean_terminated_length": 2551.08349609375, "completions/min_length": 1350.0, "completions/min_terminated_length": 1350.0, "epoch": 0.3246268656716418, "frac_reward_zero_std": 0.0, "grad_norm": 0.6476059556007385, "kl": 0.0, "learning_rate": 3.4368530020703934e-07, "loss": -0.0199, "num_tokens": 32160011.0, "reward": 0.7666666507720947, "reward_std": 0.36329931020736694, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 957 }, { "completion_length": 2170.5834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 4740.0, "completions/max_terminated_length": 4740.0, "completions/mean_length": 2170.58349609375, "completions/mean_terminated_length": 2170.58349609375, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "epoch": 0.324966078697422, "frac_reward_zero_std": 0.5, "grad_norm": 0.45245659351348877, "kl": 0.0, "learning_rate": 3.435127674258109e-07, "loss": -0.0042, "num_tokens": 32202528.0, "reward": 0.7666667699813843, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 958 }, { "completion_length": 1702.166748046875, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4745.0, "completions/mean_length": 3898.5, "completions/mean_terminated_length": 2553.25, "completions/min_length": 811.0, "completions/min_terminated_length": 811.0, "epoch": 0.32530529172320216, "frac_reward_zero_std": 0.0, "grad_norm": 0.7316205501556396, "kl": NaN, "learning_rate": 3.4334023464458244e-07, "loss": -0.0585, "num_tokens": 32236376.0, "reward": 0.8000000715255737, "reward_std": 0.33565855026245117, "rewards/correctness_reward_func/mean": 0.5999999642372131, "rewards/correctness_reward_func/std": 0.45126086473464966, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 959 }, { "completion_length": 1313.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3487.0, "completions/max_terminated_length": 3487.0, "completions/mean_length": 1313.0833740234375, "completions/mean_terminated_length": 1313.0833740234375, "completions/min_length": 599.0, "completions/min_terminated_length": 599.0, "epoch": 0.3256445047489824, "frac_reward_zero_std": 0.0, "grad_norm": 0.10751835256814957, "kl": 0.0, "learning_rate": 3.4316770186335405e-07, "loss": 0.001, "num_tokens": 32263551.0, "reward": 1.183333396911621, "reward_std": 0.09246458858251572, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 960 }, { "completion_length": 672.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 672.5, "completions/mean_terminated_length": 672.5, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.32598371777476254, "frac_reward_zero_std": 1.0, "grad_norm": 1.0762104096784242e-07, "kl": 0.0, "learning_rate": 3.429951690821256e-07, "loss": 0.0, "num_tokens": 32283729.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 961 }, { "completion_length": 1622.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2902.0, "completions/max_terminated_length": 2902.0, "completions/mean_length": 1622.75, "completions/mean_terminated_length": 1622.75, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 0.32632293080054275, "frac_reward_zero_std": 0.5, "grad_norm": 0.08299823850393295, "kl": 0.0, "learning_rate": 3.4282263630089715e-07, "loss": -0.0008, "num_tokens": 32315124.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.09045340120792389, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 962 }, { "completion_length": 2098.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3588.0, "completions/max_terminated_length": 3588.0, "completions/mean_length": 2098.08349609375, "completions/mean_terminated_length": 2098.08349609375, "completions/min_length": 842.0, "completions/min_terminated_length": 842.0, "epoch": 0.3266621438263229, "frac_reward_zero_std": 0.0, "grad_norm": 0.5058161616325378, "kl": 0.0, "learning_rate": 3.426501035196687e-07, "loss": -0.0014, "num_tokens": 32354557.0, "reward": 0.8500000834465027, "reward_std": 0.2557638883590698, "rewards/correctness_reward_func/mean": 0.550000011920929, "rewards/correctness_reward_func/std": 0.49082493782043457, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 963 }, { "completion_length": 1694.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3257.0, "completions/max_terminated_length": 3257.0, "completions/mean_length": 1694.75, "completions/mean_terminated_length": 1694.75, "completions/min_length": 557.0, "completions/min_terminated_length": 557.0, "epoch": 0.32700135685210313, "frac_reward_zero_std": 0.5, "grad_norm": 0.06396138668060303, "kl": 0.0, "learning_rate": 3.424775707384403e-07, "loss": 0.0003, "num_tokens": 32387842.0, "reward": 0.7666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 964 }, { "completion_length": 1904.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 6139.0, "completions/max_terminated_length": 6139.0, "completions/mean_length": 1904.8333740234375, "completions/mean_terminated_length": 1904.8333740234375, "completions/min_length": 552.0, "completions/min_terminated_length": 552.0, "epoch": 0.3273405698778833, "frac_reward_zero_std": 0.5, "grad_norm": 0.1909196823835373, "kl": 0.0, "learning_rate": 3.423050379572118e-07, "loss": 0.0087, "num_tokens": 32422652.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 965 }, { "completion_length": 1142.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2871.0, "completions/max_terminated_length": 2871.0, "completions/mean_length": 1142.416748046875, "completions/mean_terminated_length": 1142.416748046875, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.3276797829036635, "frac_reward_zero_std": 1.0, "grad_norm": 3.0000242645655817e-07, "kl": 0.0, "learning_rate": 3.421325051759834e-07, "loss": 0.0, "num_tokens": 32446513.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 966 }, { "completion_length": 2232.5834350585938, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5776.0, "completions/mean_length": 2781.666748046875, "completions/mean_terminated_length": 2435.54541015625, "completions/min_length": 844.0, "completions/min_terminated_length": 844.0, "epoch": 0.32801899592944367, "frac_reward_zero_std": 0.5, "grad_norm": 0.3459160327911377, "kl": NaN, "learning_rate": 3.4195997239475497e-07, "loss": -0.0268, "num_tokens": 32487488.0, "reward": 1.1541666984558105, "reward_std": 0.21588000655174255, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.287096232175827, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 967 }, { "completion_length": 1698.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4551.0, "completions/max_terminated_length": 4551.0, "completions/mean_length": 1698.416748046875, "completions/mean_terminated_length": 1698.416748046875, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 0.3283582089552239, "frac_reward_zero_std": 0.0, "grad_norm": 0.6486458778381348, "kl": 0.0, "learning_rate": 3.417874396135266e-07, "loss": 0.0035, "num_tokens": 32516797.0, "reward": 1.0500000715255737, "reward_std": 0.299967497587204, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.36306774616241455, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 968 }, { "completion_length": 1960.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4384.0, "completions/max_terminated_length": 4384.0, "completions/mean_length": 1960.3333740234375, "completions/mean_terminated_length": 1960.3333740234375, "completions/min_length": 522.0, "completions/min_terminated_length": 522.0, "epoch": 0.32869742198100405, "frac_reward_zero_std": 0.5, "grad_norm": 0.4522709548473358, "kl": 0.0, "learning_rate": 3.416149068322981e-07, "loss": -0.0257, "num_tokens": 32551721.0, "reward": 0.38333332538604736, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.0833333358168602, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 969 }, { "completion_length": 1291.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2350.0, "completions/max_terminated_length": 2350.0, "completions/mean_length": 1291.416748046875, "completions/mean_terminated_length": 1291.416748046875, "completions/min_length": 583.0, "completions/min_terminated_length": 583.0, "epoch": 0.32903663500678426, "frac_reward_zero_std": 0.5, "grad_norm": 0.5910433530807495, "kl": 0.0, "learning_rate": 3.414423740510697e-07, "loss": 0.0022, "num_tokens": 32579584.0, "reward": 1.0333335399627686, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.7333332896232605, "rewards/correctness_reward_func/std": 0.3550501763820648, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 970 }, { "completion_length": 2047.666748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5423.0, "completions/mean_length": 3694.916748046875, "completions/mean_terminated_length": 2730.22216796875, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.3293758480325645, "frac_reward_zero_std": 0.0, "grad_norm": 0.6515673398971558, "kl": NaN, "learning_rate": 3.412698412698413e-07, "loss": 0.0215, "num_tokens": 32614278.0, "reward": 0.3916666507720947, "reward_std": 0.34035724401474, "rewards/correctness_reward_func/mean": 0.1666666716337204, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 971 }, { "completion_length": 1830.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4031.0, "completions/max_terminated_length": 4031.0, "completions/mean_length": 1830.3333740234375, "completions/mean_terminated_length": 1830.3333740234375, "completions/min_length": 1033.0, "completions/min_terminated_length": 1033.0, "epoch": 0.32971506105834464, "frac_reward_zero_std": 0.0, "grad_norm": 0.1503661572933197, "kl": 0.0, "learning_rate": 3.4109730848861284e-07, "loss": 0.0047, "num_tokens": 32649274.0, "reward": 1.2166666984558105, "reward_std": 0.10641199350357056, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 972 }, { "completion_length": 797.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1650.0, "completions/max_terminated_length": 1650.0, "completions/mean_length": 797.8333740234375, "completions/mean_terminated_length": 797.8333740234375, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 0.33005427408412485, "frac_reward_zero_std": 1.0, "grad_norm": 2.0924046850723244e-07, "kl": 0.0, "learning_rate": 3.409247757073844e-07, "loss": 0.0, "num_tokens": 32672906.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 973 }, { "completion_length": 1648.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4696.0, "completions/max_terminated_length": 4696.0, "completions/mean_length": 1648.0, "completions/mean_terminated_length": 1648.0, "completions/min_length": 584.0, "completions/min_terminated_length": 584.0, "epoch": 0.330393487109905, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.4075224292615595e-07, "loss": 0.0, "num_tokens": 32709692.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 974 }, { "completion_length": 795.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1302.0, "completions/max_terminated_length": 1302.0, "completions/mean_length": 795.8333740234375, "completions/mean_terminated_length": 795.8333740234375, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 0.33073270013568523, "frac_reward_zero_std": 1.0, "grad_norm": 8.773958626306921e-08, "kl": 0.0, "learning_rate": 3.4057971014492755e-07, "loss": 0.0, "num_tokens": 32731848.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 975 }, { "completion_length": 2053.666748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5526.0, "completions/mean_length": 2602.75, "completions/mean_terminated_length": 2240.36376953125, "completions/min_length": 801.0, "completions/min_terminated_length": 801.0, "epoch": 0.3310719131614654, "frac_reward_zero_std": 0.0, "grad_norm": 0.41066351532936096, "kl": NaN, "learning_rate": 3.4040717736369905e-07, "loss": 0.0118, "num_tokens": 32770622.0, "reward": 0.8583333492279053, "reward_std": 0.28804606199264526, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 976 }, { "completion_length": 4053.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 6396.0, "completions/max_terminated_length": 6396.0, "completions/mean_length": 4053.58349609375, "completions/mean_terminated_length": 4053.58349609375, "completions/min_length": 2075.0, "completions/min_terminated_length": 2075.0, "epoch": 0.3314111261872456, "frac_reward_zero_std": 1.0, "grad_norm": 2.8918020689161494e-07, "kl": 0.0, "learning_rate": 3.4023464458247066e-07, "loss": 0.0, "num_tokens": 32829225.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 977 }, { "completion_length": 1234.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4258.0, "completions/max_terminated_length": 4258.0, "completions/mean_length": 1234.166748046875, "completions/mean_terminated_length": 1234.166748046875, "completions/min_length": 605.0, "completions/min_terminated_length": 605.0, "epoch": 0.33175033921302577, "frac_reward_zero_std": 0.5, "grad_norm": 0.6808133125305176, "kl": 0.0, "learning_rate": 3.400621118012422e-07, "loss": 0.026, "num_tokens": 32852561.0, "reward": 1.133333444595337, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 978 }, { "completion_length": 2129.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3540.0, "completions/max_terminated_length": 3540.0, "completions/mean_length": 2129.416748046875, "completions/mean_terminated_length": 2129.416748046875, "completions/min_length": 742.0, "completions/min_terminated_length": 742.0, "epoch": 0.332089552238806, "frac_reward_zero_std": 1.0, "grad_norm": 2.425616116852325e-07, "kl": 0.0, "learning_rate": 3.398895790200138e-07, "loss": 0.0, "num_tokens": 32892274.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 979 }, { "completion_length": 1461.0833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3444.0, "completions/mean_length": 2010.166748046875, "completions/mean_terminated_length": 1593.9091796875, "completions/min_length": 801.0, "completions/min_terminated_length": 801.0, "epoch": 0.33242876526458615, "frac_reward_zero_std": 0.5, "grad_norm": 0.22468790411949158, "kl": NaN, "learning_rate": 3.397170462387853e-07, "loss": -0.0282, "num_tokens": 32921297.0, "reward": 1.2041666507720947, "reward_std": 0.23474274575710297, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 980 }, { "completion_length": 2105.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6066.0, "completions/max_terminated_length": 6066.0, "completions/mean_length": 2105.416748046875, "completions/mean_terminated_length": 2105.416748046875, "completions/min_length": 554.0, "completions/min_terminated_length": 554.0, "epoch": 0.33276797829036636, "frac_reward_zero_std": 1.0, "grad_norm": 2.466020134761493e-07, "kl": 0.0, "learning_rate": 3.395445134575569e-07, "loss": 0.0, "num_tokens": 32956858.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 981 }, { "completion_length": 1458.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3091.0, "completions/max_terminated_length": 3091.0, "completions/mean_length": 1458.416748046875, "completions/mean_terminated_length": 1458.416748046875, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 0.3331071913161465, "frac_reward_zero_std": 1.0, "grad_norm": 1.5790541851856688e-07, "kl": 0.0, "learning_rate": 3.393719806763285e-07, "loss": 0.0, "num_tokens": 32984493.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 982 }, { "completion_length": 1025.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1851.0, "completions/max_terminated_length": 1851.0, "completions/mean_length": 1025.666748046875, "completions/mean_terminated_length": 1025.666748046875, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 0.33344640434192674, "frac_reward_zero_std": 0.5, "grad_norm": 0.08788394927978516, "kl": 0.0, "learning_rate": 3.391994478951001e-07, "loss": 0.0007, "num_tokens": 33010637.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 983 }, { "completion_length": 2536.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4205.0, "completions/max_terminated_length": 4205.0, "completions/mean_length": 2536.416748046875, "completions/mean_terminated_length": 2536.416748046875, "completions/min_length": 1341.0, "completions/min_terminated_length": 1341.0, "epoch": 0.3337856173677069, "frac_reward_zero_std": 0.5, "grad_norm": 0.15756388008594513, "kl": 0.0, "learning_rate": 3.390269151138716e-07, "loss": -0.0005, "num_tokens": 33048778.0, "reward": 0.7333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 984 }, { "completion_length": 832.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1372.0, "completions/max_terminated_length": 1372.0, "completions/mean_length": 832.4166870117188, "completions/mean_terminated_length": 832.4166870117188, "completions/min_length": 550.0, "completions/min_terminated_length": 550.0, "epoch": 0.3341248303934871, "frac_reward_zero_std": 0.5, "grad_norm": 0.04834370315074921, "kl": 0.0, "learning_rate": 3.388543823326432e-07, "loss": -0.0004, "num_tokens": 33074493.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 985 }, { "completion_length": 1010.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1571.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 1010.5833740234375, "completions/mean_terminated_length": 1010.5833740234375, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 0.3344640434192673, "frac_reward_zero_std": 1.0, "grad_norm": 1.7334615165509604e-07, "kl": 0.0, "learning_rate": 3.386818495514148e-07, "loss": 0.0, "num_tokens": 33099076.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 986 }, { "completion_length": 1127.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3103.0, "completions/max_terminated_length": 3103.0, "completions/mean_length": 1127.166748046875, "completions/mean_terminated_length": 1127.166748046875, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.3348032564450475, "frac_reward_zero_std": 0.5, "grad_norm": 0.3109779357910156, "kl": 0.0, "learning_rate": 3.385093167701863e-07, "loss": -0.0022, "num_tokens": 33124332.0, "reward": 0.7666667699813843, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.46666669845581055, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 987 }, { "completion_length": 1063.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2977.0, "completions/max_terminated_length": 2977.0, "completions/mean_length": 1063.75, "completions/mean_terminated_length": 1063.75, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 0.33514246947082765, "frac_reward_zero_std": 0.5, "grad_norm": 0.540890634059906, "kl": 0.0, "learning_rate": 3.383367839889579e-07, "loss": 0.0257, "num_tokens": 33149661.0, "reward": 1.1166666746139526, "reward_std": 0.24832773208618164, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.3857303261756897, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 988 }, { "completion_length": 995.5833435058594, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 3265.0, "completions/mean_length": 4290.08349609375, "completions/mean_terminated_length": 1991.166748046875, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "epoch": 0.33548168249660787, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": NaN, "learning_rate": 3.3816425120772945e-07, "loss": 0.0, "num_tokens": 33172666.0, "reward": 0.6499999761581421, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 989 }, { "completion_length": 1330.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2185.0, "completions/max_terminated_length": 2185.0, "completions/mean_length": 1330.916748046875, "completions/mean_terminated_length": 1330.916748046875, "completions/min_length": 842.0, "completions/min_terminated_length": 842.0, "epoch": 0.3358208955223881, "frac_reward_zero_std": 0.5, "grad_norm": 0.36015501618385315, "kl": 0.0, "learning_rate": 3.3799171842650106e-07, "loss": 0.0031, "num_tokens": 33200667.0, "reward": 1.1166667938232422, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 990 }, { "completion_length": 1213.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2447.0, "completions/max_terminated_length": 2447.0, "completions/mean_length": 1213.75, "completions/mean_terminated_length": 1213.75, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "epoch": 0.33616010854816825, "frac_reward_zero_std": 1.0, "grad_norm": 1.6253594026238716e-07, "kl": 0.0, "learning_rate": 3.3781918564527256e-07, "loss": 0.0, "num_tokens": 33226422.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 991 }, { "completion_length": 1514.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3412.0, "completions/max_terminated_length": 3412.0, "completions/mean_length": 1514.75, "completions/mean_terminated_length": 1514.75, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.33649932157394846, "frac_reward_zero_std": 1.0, "grad_norm": 1.840590471147152e-07, "kl": 0.0, "learning_rate": 3.3764665286404416e-07, "loss": 0.0, "num_tokens": 33257235.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 992 }, { "completion_length": 794.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1140.0, "completions/max_terminated_length": 1140.0, "completions/mean_length": 794.75, "completions/mean_terminated_length": 794.75, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 0.3368385345997286, "frac_reward_zero_std": 0.5, "grad_norm": 0.06748513877391815, "kl": 0.0, "learning_rate": 3.374741200828157e-07, "loss": 0.0, "num_tokens": 33277566.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 993 }, { "completion_length": 610.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1054.0, "completions/max_terminated_length": 1054.0, "completions/mean_length": 610.9166870117188, "completions/mean_terminated_length": 610.9166870117188, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.33717774762550884, "frac_reward_zero_std": 0.5, "grad_norm": 0.049711357802152634, "kl": 0.0, "learning_rate": 3.373015873015873e-07, "loss": 0.0005, "num_tokens": 33298025.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 994 }, { "completion_length": 1622.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4141.0, "completions/max_terminated_length": 4141.0, "completions/mean_length": 1622.416748046875, "completions/mean_terminated_length": 1622.416748046875, "completions/min_length": 766.0, "completions/min_terminated_length": 766.0, "epoch": 0.337516960651289, "frac_reward_zero_std": 0.5, "grad_norm": 0.4773035943508148, "kl": 0.0, "learning_rate": 3.371290545203588e-07, "loss": 0.0057, "num_tokens": 33325894.0, "reward": 1.1000001430511475, "reward_std": 0.20000001788139343, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 995 }, { "completion_length": 787.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1782.0, "completions/max_terminated_length": 1782.0, "completions/mean_length": 787.25, "completions/mean_terminated_length": 787.25, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 0.3378561736770692, "frac_reward_zero_std": 1.0, "grad_norm": 9.43017894883269e-08, "kl": 0.0, "learning_rate": 3.369565217391304e-07, "loss": 0.0, "num_tokens": 33346219.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 996 }, { "completion_length": 887.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1226.0, "completions/max_terminated_length": 1226.0, "completions/mean_length": 887.0833740234375, "completions/mean_terminated_length": 887.0833740234375, "completions/min_length": 589.0, "completions/min_terminated_length": 589.0, "epoch": 0.3381953867028494, "frac_reward_zero_std": 1.0, "grad_norm": 1.7791666095945402e-07, "kl": 0.0, "learning_rate": 3.36783988957902e-07, "loss": 0.0, "num_tokens": 33373244.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 997 }, { "completion_length": 894.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1343.0, "completions/max_terminated_length": 1343.0, "completions/mean_length": 894.1666870117188, "completions/mean_terminated_length": 894.1666870117188, "completions/min_length": 571.0, "completions/min_terminated_length": 571.0, "epoch": 0.3385345997286296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.3661145617667353e-07, "loss": 0.0, "num_tokens": 33394702.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 998 }, { "completion_length": 802.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1790.0, "completions/max_terminated_length": 1790.0, "completions/mean_length": 802.0833740234375, "completions/mean_terminated_length": 802.0833740234375, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 0.33887381275440975, "frac_reward_zero_std": 0.0, "grad_norm": 0.1298748254776001, "kl": 0.0, "learning_rate": 3.364389233954451e-07, "loss": 0.0004, "num_tokens": 33415397.0, "reward": 1.2333333492279053, "reward_std": 0.10327950119972229, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 999 }, { "completion_length": 594.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 735.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 594.8333740234375, "completions/mean_terminated_length": 594.8333740234375, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.33921302578018997, "frac_reward_zero_std": 1.0, "grad_norm": 1.740247910220205e-07, "kl": 0.0, "learning_rate": 3.362663906142167e-07, "loss": 0.0, "num_tokens": 33435015.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1000 }, { "completion_length": 871.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1417.0, "completions/max_terminated_length": 1417.0, "completions/mean_length": 871.1666870117188, "completions/mean_terminated_length": 871.1666870117188, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "epoch": 0.33955223880597013, "frac_reward_zero_std": 1.0, "grad_norm": 1.2006985627976974e-07, "kl": 0.0, "learning_rate": 3.360938578329883e-07, "loss": 0.0, "num_tokens": 33459935.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1001 }, { "completion_length": 854.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1574.0, "completions/max_terminated_length": 1574.0, "completions/mean_length": 854.25, "completions/mean_terminated_length": 854.25, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 0.33989145183175035, "frac_reward_zero_std": 0.5, "grad_norm": 0.07827668637037277, "kl": 0.0, "learning_rate": 3.359213250517598e-07, "loss": 0.0001, "num_tokens": 33487334.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1002 }, { "completion_length": 1803.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5887.0, "completions/max_terminated_length": 5887.0, "completions/mean_length": 1803.916748046875, "completions/mean_terminated_length": 1803.916748046875, "completions/min_length": 712.0, "completions/min_terminated_length": 712.0, "epoch": 0.3402306648575305, "frac_reward_zero_std": 1.0, "grad_norm": 2.5122810143329843e-07, "kl": 0.0, "learning_rate": 3.357487922705314e-07, "loss": 0.0, "num_tokens": 33525187.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1003 }, { "completion_length": 2457.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5445.0, "completions/max_terminated_length": 5445.0, "completions/mean_length": 2457.58349609375, "completions/mean_terminated_length": 2457.58349609375, "completions/min_length": 1268.0, "completions/min_terminated_length": 1268.0, "epoch": 0.3405698778833107, "frac_reward_zero_std": 0.5, "grad_norm": 0.13635294139385223, "kl": 0.0, "learning_rate": 3.3557625948930295e-07, "loss": -0.0042, "num_tokens": 33564194.0, "reward": 1.1500000953674316, "reward_std": 0.05477222427725792, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1004 }, { "completion_length": 989.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1766.0, "completions/max_terminated_length": 1766.0, "completions/mean_length": 989.75, "completions/mean_terminated_length": 989.75, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 0.3409090909090909, "frac_reward_zero_std": 0.5, "grad_norm": 0.08584901690483093, "kl": 0.0, "learning_rate": 3.3540372670807456e-07, "loss": -0.0013, "num_tokens": 33587783.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1005 }, { "completion_length": 1187.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2855.0, "completions/max_terminated_length": 2855.0, "completions/mean_length": 1187.25, "completions/mean_terminated_length": 1187.25, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 0.3412483039348711, "frac_reward_zero_std": 0.5, "grad_norm": 0.7561020851135254, "kl": 0.0, "learning_rate": 3.3523119392684606e-07, "loss": 0.0045, "num_tokens": 33612560.0, "reward": 1.066666841506958, "reward_std": 0.18618986010551453, "rewards/correctness_reward_func/mean": 0.7666667103767395, "rewards/correctness_reward_func/std": 0.2534608840942383, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1006 }, { "completion_length": 1389.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2045.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1389.0833740234375, "completions/mean_terminated_length": 1389.0833740234375, "completions/min_length": 825.0, "completions/min_terminated_length": 825.0, "epoch": 0.3415875169606513, "frac_reward_zero_std": 0.5, "grad_norm": 0.10171062499284744, "kl": 0.0, "learning_rate": 3.3505866114561767e-07, "loss": -0.0024, "num_tokens": 33641367.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1007 }, { "completion_length": 891.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1409.0, "completions/max_terminated_length": 1409.0, "completions/mean_length": 891.0, "completions/mean_terminated_length": 891.0, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 0.3419267299864315, "frac_reward_zero_std": 1.0, "grad_norm": 1.1641560604402912e-07, "kl": 0.0, "learning_rate": 3.348861283643892e-07, "loss": 0.0, "num_tokens": 33665415.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1008 }, { "completion_length": 945.3333740234375, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 3945.0, "completions/mean_length": 3690.75, "completions/mean_terminated_length": 1620.571533203125, "completions/min_length": 547.0, "completions/min_terminated_length": 547.0, "epoch": 0.3422659430122117, "frac_reward_zero_std": 0.0, "grad_norm": 0.6904885768890381, "kl": NaN, "learning_rate": 3.347135955831608e-07, "loss": -0.0427, "num_tokens": 33687445.0, "reward": 0.6750000715255737, "reward_std": 0.2761763334274292, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.4472135901451111, "rewards/format_reward_func/mean": 0.17500001192092896, "rewards/format_reward_func/std": 0.15447859466075897, "step": 1009 }, { "completion_length": 1087.25, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 1864.0, "completions/mean_length": 1636.3333740234375, "completions/mean_terminated_length": 1186.0909423828125, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "epoch": 0.34260515603799185, "frac_reward_zero_std": 0.0, "grad_norm": 0.3531261682510376, "kl": NaN, "learning_rate": 3.345410628019323e-07, "loss": -0.0168, "num_tokens": 33712192.0, "reward": 1.004166841506958, "reward_std": 0.2976078391075134, "rewards/correctness_reward_func/mean": 0.7166666984558105, "rewards/correctness_reward_func/std": 0.3459725081920624, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1010 }, { "completion_length": 1607.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3756.0, "completions/max_terminated_length": 3756.0, "completions/mean_length": 1607.5833740234375, "completions/mean_terminated_length": 1607.5833740234375, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 0.34294436906377207, "frac_reward_zero_std": 0.5, "grad_norm": 0.13025231659412384, "kl": 0.0, "learning_rate": 3.3436853002070393e-07, "loss": 0.0002, "num_tokens": 33744017.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1011 }, { "completion_length": 2006.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 3738.0, "completions/max_terminated_length": 3738.0, "completions/mean_length": 2006.0, "completions/mean_terminated_length": 2006.0, "completions/min_length": 815.0, "completions/min_terminated_length": 815.0, "epoch": 0.34328358208955223, "frac_reward_zero_std": 1.0, "grad_norm": 1.3713541591187095e-07, "kl": 0.0, "learning_rate": 3.341959972394755e-07, "loss": 0.0, "num_tokens": 33781289.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1012 }, { "completion_length": 2606.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4444.0, "completions/max_terminated_length": 4444.0, "completions/mean_length": 2606.416748046875, "completions/mean_terminated_length": 2606.416748046875, "completions/min_length": 1472.0, "completions/min_terminated_length": 1472.0, "epoch": 0.34362279511533245, "frac_reward_zero_std": 0.0, "grad_norm": 0.16722708940505981, "kl": 0.0, "learning_rate": 3.3402346445824704e-07, "loss": -0.0023, "num_tokens": 33827110.0, "reward": 1.2000000476837158, "reward_std": 0.08164963126182556, "rewards/correctness_reward_func/mean": 0.9000000357627869, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1013 }, { "completion_length": 1144.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3448.0, "completions/max_terminated_length": 3448.0, "completions/mean_length": 1144.916748046875, "completions/mean_terminated_length": 1144.916748046875, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.3439620081411126, "frac_reward_zero_std": 0.5, "grad_norm": 0.5857581496238708, "kl": 0.0, "learning_rate": 3.338509316770186e-07, "loss": 0.022, "num_tokens": 33854511.0, "reward": 1.0166666507720947, "reward_std": 0.24013885855674744, "rewards/correctness_reward_func/mean": 0.7166666984558105, "rewards/correctness_reward_func/std": 0.4386618733406067, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1014 }, { "completion_length": 1121.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1750.0, "completions/max_terminated_length": 1750.0, "completions/mean_length": 1121.0, "completions/mean_terminated_length": 1121.0, "completions/min_length": 602.0, "completions/min_terminated_length": 602.0, "epoch": 0.3443012211668928, "frac_reward_zero_std": 1.0, "grad_norm": 1.0061319954957071e-07, "kl": 0.0, "learning_rate": 3.336783988957902e-07, "loss": 0.0, "num_tokens": 33876807.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1015 }, { "completion_length": 768.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1217.0, "completions/max_terminated_length": 1217.0, "completions/mean_length": 768.25, "completions/mean_terminated_length": 768.25, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 0.344640434192673, "frac_reward_zero_std": 1.0, "grad_norm": 8.765631065443813e-08, "kl": 0.0, "learning_rate": 3.3350586611456175e-07, "loss": 0.0, "num_tokens": 33899310.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1016 }, { "completion_length": 1266.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3236.0, "completions/max_terminated_length": 3236.0, "completions/mean_length": 1266.5, "completions/mean_terminated_length": 1266.5, "completions/min_length": 546.0, "completions/min_terminated_length": 546.0, "epoch": 0.3449796472184532, "frac_reward_zero_std": 1.0, "grad_norm": 2.2533848209604912e-07, "kl": 0.0, "learning_rate": 3.333333333333333e-07, "loss": 0.0, "num_tokens": 33925434.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1017 }, { "completion_length": 1280.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4005.0, "completions/max_terminated_length": 4005.0, "completions/mean_length": 1280.0833740234375, "completions/mean_terminated_length": 1280.0833740234375, "completions/min_length": 566.0, "completions/min_terminated_length": 566.0, "epoch": 0.34531886024423336, "frac_reward_zero_std": 0.5, "grad_norm": 0.06866706162691116, "kl": 0.0, "learning_rate": 3.331608005521049e-07, "loss": -0.0021, "num_tokens": 33955525.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1018 }, { "completion_length": 801.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1541.0, "completions/max_terminated_length": 1541.0, "completions/mean_length": 801.0833740234375, "completions/mean_terminated_length": 801.0833740234375, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 0.3456580732700136, "frac_reward_zero_std": 1.0, "grad_norm": 1.2831240780997177e-07, "kl": 0.0, "learning_rate": 3.3298826777087646e-07, "loss": 0.0, "num_tokens": 33976886.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1019 }, { "completion_length": 1005.6667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2369.0, "completions/max_terminated_length": 2369.0, "completions/mean_length": 1005.6666870117188, "completions/mean_terminated_length": 1005.6666870117188, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.34599728629579374, "frac_reward_zero_std": 0.5, "grad_norm": 0.553555428981781, "kl": 0.0, "learning_rate": 3.3281573498964806e-07, "loss": -0.0182, "num_tokens": 34000294.0, "reward": 1.0499999523162842, "reward_std": 0.27386125922203064, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1020 }, { "completion_length": 1262.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2686.0, "completions/max_terminated_length": 2686.0, "completions/mean_length": 1262.25, "completions/mean_terminated_length": 1262.25, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.34633649932157395, "frac_reward_zero_std": 0.5, "grad_norm": 0.09531509131193161, "kl": 0.0, "learning_rate": 3.3264320220841956e-07, "loss": -0.0011, "num_tokens": 34021825.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1021 }, { "completion_length": 1551.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3782.0, "completions/max_terminated_length": 3782.0, "completions/mean_length": 1551.166748046875, "completions/mean_terminated_length": 1551.166748046875, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 0.3466757123473541, "frac_reward_zero_std": 1.0, "grad_norm": 1.1376604192037121e-07, "kl": 0.0, "learning_rate": 3.3247066942719117e-07, "loss": 0.0, "num_tokens": 34053189.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1022 }, { "completion_length": 836.1666870117188, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 1385.25, "completions/mean_terminated_length": 912.1818237304688, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.34701492537313433, "frac_reward_zero_std": 0.0, "grad_norm": 0.10201205313205719, "kl": NaN, "learning_rate": 3.322981366459627e-07, "loss": -0.0034, "num_tokens": 34073795.0, "reward": 0.7250000834465027, "reward_std": 0.11600949615240097, "rewards/correctness_reward_func/mean": 0.45000001788139343, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1023 }, { "completion_length": 2209.666748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5226.0, "completions/mean_length": 2758.75, "completions/mean_terminated_length": 2410.54541015625, "completions/min_length": 1073.0, "completions/min_terminated_length": 1073.0, "epoch": 0.3473541383989145, "frac_reward_zero_std": 0.0, "grad_norm": 0.8708356618881226, "kl": NaN, "learning_rate": 3.321256038647343e-07, "loss": -0.0279, "num_tokens": 34117309.0, "reward": 0.8916666507720947, "reward_std": 0.4566118121147156, "rewards/correctness_reward_func/mean": 0.6166666150093079, "rewards/correctness_reward_func/std": 0.4628632962703705, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1024 }, { "completion_length": 1980.25, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4559.0, "completions/mean_length": 2529.33349609375, "completions/mean_terminated_length": 2160.272705078125, "completions/min_length": 1070.0, "completions/min_terminated_length": 1070.0, "epoch": 0.3476933514246947, "frac_reward_zero_std": 0.5, "grad_norm": 0.2008478343486786, "kl": NaN, "learning_rate": 3.3195307108350583e-07, "loss": -0.0101, "num_tokens": 34154560.0, "reward": 0.7749999761581421, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1025 }, { "completion_length": 781.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1404.0, "completions/max_terminated_length": 1404.0, "completions/mean_length": 781.0, "completions/mean_terminated_length": 781.0, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.3480325644504749, "frac_reward_zero_std": 0.5, "grad_norm": 0.07265475392341614, "kl": 0.0, "learning_rate": 3.3178053830227743e-07, "loss": -0.0003, "num_tokens": 34178848.0, "reward": 1.2333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1026 }, { "completion_length": 2321.166748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4887.0, "completions/mean_length": 2870.25, "completions/mean_terminated_length": 2532.181884765625, "completions/min_length": 852.0, "completions/min_terminated_length": 852.0, "epoch": 0.3483717774762551, "frac_reward_zero_std": 0.5, "grad_norm": 0.12479604035615921, "kl": NaN, "learning_rate": 3.31608005521049e-07, "loss": -0.0099, "num_tokens": 34221102.0, "reward": 0.6750000715255737, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1027 }, { "completion_length": 1104.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2275.0, "completions/max_terminated_length": 2275.0, "completions/mean_length": 1104.25, "completions/mean_terminated_length": 1104.25, "completions/min_length": 667.0, "completions/min_terminated_length": 667.0, "epoch": 0.3487109905020353, "frac_reward_zero_std": 0.5, "grad_norm": 0.0727585181593895, "kl": 0.0, "learning_rate": 3.3143547273982054e-07, "loss": 0.0004, "num_tokens": 34247379.0, "reward": 1.2333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1028 }, { "completion_length": 2219.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5237.0, "completions/max_terminated_length": 5237.0, "completions/mean_length": 2219.75, "completions/mean_terminated_length": 2219.75, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.34905020352781546, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.312629399585921e-07, "loss": 0.0, "num_tokens": 34287444.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1029 }, { "completion_length": 1435.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4003.0, "completions/max_terminated_length": 4003.0, "completions/mean_length": 1435.166748046875, "completions/mean_terminated_length": 1435.166748046875, "completions/min_length": 652.0, "completions/min_terminated_length": 652.0, "epoch": 0.3493894165535957, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.310904071773637e-07, "loss": 0.0, "num_tokens": 34319492.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1030 }, { "completion_length": 798.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2243.0, "completions/max_terminated_length": 2243.0, "completions/mean_length": 798.0833740234375, "completions/mean_terminated_length": 798.0833740234375, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.34972862957937584, "frac_reward_zero_std": 0.5, "grad_norm": 0.061374977231025696, "kl": 0.0, "learning_rate": 3.3091787439613525e-07, "loss": -0.0004, "num_tokens": 34343367.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1031 }, { "completion_length": 1524.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2793.0, "completions/max_terminated_length": 2793.0, "completions/mean_length": 1524.3333740234375, "completions/mean_terminated_length": 1524.3333740234375, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 0.35006784260515605, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.307453416149068e-07, "loss": 0.0, "num_tokens": 34371847.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1032 }, { "completion_length": 1157.9166870117188, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 2577.0, "completions/mean_length": 1707.0, "completions/mean_terminated_length": 1263.181884765625, "completions/min_length": 603.0, "completions/min_terminated_length": 603.0, "epoch": 0.3504070556309362, "frac_reward_zero_std": 0.5, "grad_norm": 0.21980571746826172, "kl": NaN, "learning_rate": 3.305728088336784e-07, "loss": -0.0175, "num_tokens": 34393740.0, "reward": 1.0750000476837158, "reward_std": 0.2602882981300354, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1033 }, { "completion_length": 2073.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3709.0, "completions/max_terminated_length": 3709.0, "completions/mean_length": 2073.08349609375, "completions/mean_terminated_length": 2073.08349609375, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 0.35074626865671643, "frac_reward_zero_std": 1.0, "grad_norm": 2.0707614112325246e-07, "kl": 0.0, "learning_rate": 3.3040027605244996e-07, "loss": 0.0, "num_tokens": 34426879.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1034 }, { "completion_length": 3526.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6434.0, "completions/max_terminated_length": 6434.0, "completions/mean_length": 3526.5, "completions/mean_terminated_length": 3526.5, "completions/min_length": 1387.0, "completions/min_terminated_length": 1387.0, "epoch": 0.3510854816824966, "frac_reward_zero_std": 1.0, "grad_norm": 3.749959489596222e-07, "kl": 0.0, "learning_rate": 3.302277432712215e-07, "loss": 0.0, "num_tokens": 34480735.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1035 }, { "completion_length": 639.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1067.0, "completions/max_terminated_length": 1067.0, "completions/mean_length": 639.5, "completions/mean_terminated_length": 639.5, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.3514246947082768, "frac_reward_zero_std": 1.0, "grad_norm": 9.25174816757135e-08, "kl": 0.0, "learning_rate": 3.3005521048999307e-07, "loss": 0.0, "num_tokens": 34498969.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1036 }, { "completion_length": 1495.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3549.0, "completions/max_terminated_length": 3549.0, "completions/mean_length": 1495.0, "completions/mean_terminated_length": 1495.0, "completions/min_length": 826.0, "completions/min_terminated_length": 826.0, "epoch": 0.35176390773405697, "frac_reward_zero_std": 0.0, "grad_norm": 0.15128767490386963, "kl": 0.0, "learning_rate": 3.298826777087647e-07, "loss": 0.0045, "num_tokens": 34529755.0, "reward": 1.183333396911621, "reward_std": 0.09246458858251572, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1037 }, { "completion_length": 2558.33349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 3675.0, "completions/max_terminated_length": 3675.0, "completions/mean_length": 2558.33349609375, "completions/mean_terminated_length": 2558.33349609375, "completions/min_length": 1327.0, "completions/min_terminated_length": 1327.0, "epoch": 0.3521031207598372, "frac_reward_zero_std": 0.0, "grad_norm": 0.554895281791687, "kl": 0.0, "learning_rate": 3.2971014492753623e-07, "loss": 0.0118, "num_tokens": 34573739.0, "reward": 1.1375000476837158, "reward_std": 0.25670480728149414, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444522619247437, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1038 }, { "completion_length": 2352.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4381.0, "completions/max_terminated_length": 4381.0, "completions/mean_length": 2352.83349609375, "completions/mean_terminated_length": 2352.83349609375, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 0.35244233378561735, "frac_reward_zero_std": 0.0, "grad_norm": 0.6315206289291382, "kl": 0.0, "learning_rate": 3.295376121463078e-07, "loss": 0.0128, "num_tokens": 34614351.0, "reward": 1.2041666507720947, "reward_std": 0.23474276065826416, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1039 }, { "completion_length": 2527.8333740234375, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6284.0, "completions/mean_length": 5273.25, "completions/mean_terminated_length": 4333.4287109375, "completions/min_length": 1762.0, "completions/min_terminated_length": 1762.0, "epoch": 0.35278154681139756, "frac_reward_zero_std": 0.0, "grad_norm": 0.882822573184967, "kl": NaN, "learning_rate": 3.2936507936507933e-07, "loss": -0.0828, "num_tokens": 34655899.0, "reward": 0.6708333492279053, "reward_std": 0.5636676549911499, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.14479610323905945, "step": 1040 }, { "completion_length": 1147.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1597.0, "completions/max_terminated_length": 1597.0, "completions/mean_length": 1147.8333740234375, "completions/mean_terminated_length": 1147.8333740234375, "completions/min_length": 572.0, "completions/min_terminated_length": 572.0, "epoch": 0.3531207598371777, "frac_reward_zero_std": 1.0, "grad_norm": 2.8061339207852143e-07, "kl": 0.0, "learning_rate": 3.2919254658385094e-07, "loss": 0.0, "num_tokens": 34686149.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1041 }, { "completion_length": 1095.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1714.0, "completions/max_terminated_length": 1714.0, "completions/mean_length": 1095.0833740234375, "completions/mean_terminated_length": 1095.0833740234375, "completions/min_length": 591.0, "completions/min_terminated_length": 591.0, "epoch": 0.35345997286295794, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.290200138026225e-07, "loss": 0.0, "num_tokens": 34711404.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1042 }, { "completion_length": 1427.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2875.0, "completions/max_terminated_length": 2875.0, "completions/mean_length": 1427.416748046875, "completions/mean_terminated_length": 1427.416748046875, "completions/min_length": 530.0, "completions/min_terminated_length": 530.0, "epoch": 0.35379918588873815, "frac_reward_zero_std": 0.5, "grad_norm": 0.09221986681222916, "kl": 0.0, "learning_rate": 3.2884748102139404e-07, "loss": -0.002, "num_tokens": 34740341.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1043 }, { "completion_length": 925.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1923.0, "completions/max_terminated_length": 1923.0, "completions/mean_length": 925.25, "completions/mean_terminated_length": 925.25, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 0.3541383989145183, "frac_reward_zero_std": 0.5, "grad_norm": 0.07247622311115265, "kl": 0.0, "learning_rate": 3.286749482401656e-07, "loss": -0.001, "num_tokens": 34763696.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1044 }, { "completion_length": 706.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1294.0, "completions/max_terminated_length": 1294.0, "completions/mean_length": 706.25, "completions/mean_terminated_length": 706.25, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.35447761194029853, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.285024154589372e-07, "loss": 0.0, "num_tokens": 34786655.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1045 }, { "completion_length": 1186.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1757.0, "completions/max_terminated_length": 1757.0, "completions/mean_length": 1186.0833740234375, "completions/mean_terminated_length": 1186.0833740234375, "completions/min_length": 758.0, "completions/min_terminated_length": 758.0, "epoch": 0.3548168249660787, "frac_reward_zero_std": 1.0, "grad_norm": 2.146434923133711e-07, "kl": 0.0, "learning_rate": 3.283298826777087e-07, "loss": 0.0, "num_tokens": 34811250.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1046 }, { "completion_length": 2200.0833740234375, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5109.0, "completions/mean_length": 3847.33349609375, "completions/mean_terminated_length": 2933.444580078125, "completions/min_length": 711.0, "completions/min_terminated_length": 711.0, "epoch": 0.3551560379918589, "frac_reward_zero_std": 0.0, "grad_norm": 0.8312497735023499, "kl": NaN, "learning_rate": 3.281573498964803e-07, "loss": -0.0789, "num_tokens": 34850137.0, "reward": 0.49166667461395264, "reward_std": 0.3452560305595398, "rewards/correctness_reward_func/mean": 0.2666666805744171, "rewards/correctness_reward_func/std": 0.393892765045166, "rewards/format_reward_func/mean": 0.2250000238418579, "rewards/format_reward_func/std": 0.13568010926246643, "step": 1047 }, { "completion_length": 580.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 580.8333740234375, "completions/mean_terminated_length": 580.8333740234375, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.35549525101763907, "frac_reward_zero_std": 0.5, "grad_norm": 0.04787130281329155, "kl": 0.0, "learning_rate": 3.2798481711525186e-07, "loss": 0.0002, "num_tokens": 34865855.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1048 }, { "completion_length": 750.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1243.0, "completions/max_terminated_length": 1243.0, "completions/mean_length": 750.0833740234375, "completions/mean_terminated_length": 750.0833740234375, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.3558344640434193, "frac_reward_zero_std": 0.5, "grad_norm": 0.0681418925523758, "kl": 0.0, "learning_rate": 3.2781228433402347e-07, "loss": -0.0008, "num_tokens": 34891386.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1049 }, { "completion_length": 983.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1408.0, "completions/max_terminated_length": 1408.0, "completions/mean_length": 983.1666870117188, "completions/mean_terminated_length": 983.1666870117188, "completions/min_length": 635.0, "completions/min_terminated_length": 635.0, "epoch": 0.35617367706919945, "frac_reward_zero_std": 1.0, "grad_norm": 2.2920099240764102e-07, "kl": 0.0, "learning_rate": 3.27639751552795e-07, "loss": 0.0, "num_tokens": 34912154.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1050 }, { "completion_length": 1352.5, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4031.0, "completions/mean_length": 2450.666748046875, "completions/mean_terminated_length": 1623.0, "completions/min_length": 601.0, "completions/min_terminated_length": 601.0, "epoch": 0.35651289009497966, "frac_reward_zero_std": 0.0, "grad_norm": 0.4123319685459137, "kl": NaN, "learning_rate": 3.2746721877156657e-07, "loss": -0.0508, "num_tokens": 34939790.0, "reward": 1.0166666507720947, "reward_std": 0.3872982859611511, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941801071167, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 1051 }, { "completion_length": 1663.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3074.0, "completions/max_terminated_length": 3074.0, "completions/mean_length": 1663.0833740234375, "completions/mean_terminated_length": 1663.0833740234375, "completions/min_length": 892.0, "completions/min_terminated_length": 892.0, "epoch": 0.3568521031207598, "frac_reward_zero_std": 0.5, "grad_norm": 0.08811885118484497, "kl": 0.0, "learning_rate": 3.272946859903382e-07, "loss": -0.0007, "num_tokens": 34972455.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1052 }, { "completion_length": 869.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1839.0, "completions/max_terminated_length": 1839.0, "completions/mean_length": 869.75, "completions/mean_terminated_length": 869.75, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 0.35719131614654004, "frac_reward_zero_std": 0.5, "grad_norm": 0.44060760736465454, "kl": 0.0, "learning_rate": 3.2712215320910973e-07, "loss": 0.0007, "num_tokens": 34993146.0, "reward": 1.1000001430511475, "reward_std": 0.23664319515228271, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.38138505816459656, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1053 }, { "completion_length": 1207.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1850.0, "completions/max_terminated_length": 1850.0, "completions/mean_length": 1207.916748046875, "completions/mean_terminated_length": 1207.916748046875, "completions/min_length": 693.0, "completions/min_terminated_length": 693.0, "epoch": 0.3575305291723202, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.269496204278813e-07, "loss": 0.0, "num_tokens": 35015045.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1054 }, { "completion_length": 1001.9167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2617.0, "completions/max_terminated_length": 2617.0, "completions/mean_length": 1001.9166870117188, "completions/mean_terminated_length": 1001.9166870117188, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.3578697421981004, "frac_reward_zero_std": 1.0, "grad_norm": 2.670721244157903e-07, "kl": 0.0, "learning_rate": 3.2677708764665284e-07, "loss": 0.0, "num_tokens": 35038096.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1055 }, { "completion_length": 1123.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2157.0, "completions/max_terminated_length": 2157.0, "completions/mean_length": 1123.916748046875, "completions/mean_terminated_length": 1123.916748046875, "completions/min_length": 679.0, "completions/min_terminated_length": 679.0, "epoch": 0.3582089552238806, "frac_reward_zero_std": 1.0, "grad_norm": 1.5028514610548882e-07, "kl": 0.0, "learning_rate": 3.2660455486542444e-07, "loss": 0.0, "num_tokens": 35058237.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1056 }, { "completion_length": 1138.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2538.0, "completions/max_terminated_length": 2538.0, "completions/mean_length": 1138.8333740234375, "completions/mean_terminated_length": 1138.8333740234375, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 0.3585481682496608, "frac_reward_zero_std": 0.5, "grad_norm": 0.06925853341817856, "kl": 0.0, "learning_rate": 3.2643202208419594e-07, "loss": -0.001, "num_tokens": 35083987.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1057 }, { "completion_length": 1512.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2942.0, "completions/max_terminated_length": 2942.0, "completions/mean_length": 1512.416748046875, "completions/mean_terminated_length": 1512.416748046875, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 0.35888738127544095, "frac_reward_zero_std": 0.5, "grad_norm": 0.10702119767665863, "kl": 0.0, "learning_rate": 3.2625948930296755e-07, "loss": -0.0041, "num_tokens": 35117148.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1058 }, { "completion_length": 2390.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5217.0, "completions/max_terminated_length": 5217.0, "completions/mean_length": 2390.25, "completions/mean_terminated_length": 2390.25, "completions/min_length": 1420.0, "completions/min_terminated_length": 1420.0, "epoch": 0.35922659430122117, "frac_reward_zero_std": 0.5, "grad_norm": 0.12189286202192307, "kl": 0.0, "learning_rate": 3.260869565217391e-07, "loss": -0.0019, "num_tokens": 35156829.0, "reward": 1.1500000953674316, "reward_std": 0.05477222427725792, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1059 }, { "completion_length": 1614.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4585.0, "completions/max_terminated_length": 4585.0, "completions/mean_length": 1614.916748046875, "completions/mean_terminated_length": 1614.916748046875, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 0.35956580732700133, "frac_reward_zero_std": 1.0, "grad_norm": 2.0543426160202216e-07, "kl": 0.0, "learning_rate": 3.259144237405107e-07, "loss": 0.0, "num_tokens": 35191580.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1060 }, { "completion_length": 1448.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3261.0, "completions/max_terminated_length": 3261.0, "completions/mean_length": 1448.3333740234375, "completions/mean_terminated_length": 1448.3333740234375, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 0.35990502035278155, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.257418909592822e-07, "loss": 0.0, "num_tokens": 35219790.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1061 }, { "completion_length": 1124.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2019.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1124.5833740234375, "completions/mean_terminated_length": 1124.5833740234375, "completions/min_length": 559.0, "completions/min_terminated_length": 559.0, "epoch": 0.36024423337856176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.255693581780538e-07, "loss": 0.0, "num_tokens": 35243779.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1062 }, { "completion_length": 774.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1280.0, "completions/max_terminated_length": 1280.0, "completions/mean_length": 774.4166870117188, "completions/mean_terminated_length": 774.4166870117188, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.3605834464043419, "frac_reward_zero_std": 0.5, "grad_norm": 0.06407224386930466, "kl": 0.0, "learning_rate": 3.2539682539682537e-07, "loss": 0.0003, "num_tokens": 35262690.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1063 }, { "completion_length": 1269.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2669.0, "completions/max_terminated_length": 2669.0, "completions/mean_length": 1269.25, "completions/mean_terminated_length": 1269.25, "completions/min_length": 522.0, "completions/min_terminated_length": 522.0, "epoch": 0.36092265943012214, "frac_reward_zero_std": 0.5, "grad_norm": 0.10210468620061874, "kl": 0.0, "learning_rate": 3.2522429261559697e-07, "loss": 0.0008, "num_tokens": 35290443.0, "reward": 1.2333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1064 }, { "completion_length": 1494.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2720.0, "completions/max_terminated_length": 2720.0, "completions/mean_length": 1494.75, "completions/mean_terminated_length": 1494.75, "completions/min_length": 611.0, "completions/min_terminated_length": 611.0, "epoch": 0.3612618724559023, "frac_reward_zero_std": 0.5, "grad_norm": 0.13579928874969482, "kl": 0.0, "learning_rate": 3.250517598343685e-07, "loss": 0.0009, "num_tokens": 35318262.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1065 }, { "completion_length": 1385.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2323.0, "completions/max_terminated_length": 2323.0, "completions/mean_length": 1385.5833740234375, "completions/mean_terminated_length": 1385.5833740234375, "completions/min_length": 870.0, "completions/min_terminated_length": 870.0, "epoch": 0.3616010854816825, "frac_reward_zero_std": 0.5, "grad_norm": 0.09532253444194794, "kl": 0.0, "learning_rate": 3.248792270531401e-07, "loss": -0.0009, "num_tokens": 35347231.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1066 }, { "completion_length": 1192.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2195.0, "completions/max_terminated_length": 2195.0, "completions/mean_length": 1192.666748046875, "completions/mean_terminated_length": 1192.666748046875, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 0.3619402985074627, "frac_reward_zero_std": 0.5, "grad_norm": 0.0895840972661972, "kl": 0.0, "learning_rate": 3.247066942719117e-07, "loss": -0.0032, "num_tokens": 35372319.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1067 }, { "completion_length": 1045.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1523.0, "completions/max_terminated_length": 1523.0, "completions/mean_length": 1045.666748046875, "completions/mean_terminated_length": 1045.666748046875, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "epoch": 0.3622795115332429, "frac_reward_zero_std": 0.5, "grad_norm": 0.3944280743598938, "kl": 0.0, "learning_rate": 3.245341614906832e-07, "loss": 0.0072, "num_tokens": 35395643.0, "reward": 0.5, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.20000000298023224, "rewards/correctness_reward_func/std": 0.36181363463401794, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1068 }, { "completion_length": 1711.1666870117188, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6458.0, "completions/mean_length": 3358.416748046875, "completions/mean_terminated_length": 2281.5556640625, "completions/min_length": 667.0, "completions/min_terminated_length": 667.0, "epoch": 0.36261872455902305, "frac_reward_zero_std": 0.5, "grad_norm": 1.050557017326355, "kl": NaN, "learning_rate": 3.243616287094548e-07, "loss": -0.0541, "num_tokens": 35431939.0, "reward": 0.9041666388511658, "reward_std": 0.3116154074668884, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.4923659861087799, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.11894422769546509, "step": 1069 }, { "completion_length": 1705.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3476.0, "completions/max_terminated_length": 3476.0, "completions/mean_length": 1705.416748046875, "completions/mean_terminated_length": 1705.416748046875, "completions/min_length": 857.0, "completions/min_terminated_length": 857.0, "epoch": 0.36295793758480327, "frac_reward_zero_std": 0.5, "grad_norm": 0.09383396804332733, "kl": 0.0, "learning_rate": 3.2418909592822634e-07, "loss": 0.0001, "num_tokens": 35466522.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1070 }, { "completion_length": 1608.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3873.0, "completions/max_terminated_length": 3873.0, "completions/mean_length": 1608.916748046875, "completions/mean_terminated_length": 1608.916748046875, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.36329715061058343, "frac_reward_zero_std": 1.0, "grad_norm": 2.2577610536700377e-07, "kl": 0.0, "learning_rate": 3.2401656314699795e-07, "loss": 0.0, "num_tokens": 35495501.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1071 }, { "completion_length": 922.4167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2342.0, "completions/max_terminated_length": 2342.0, "completions/mean_length": 922.4166870117188, "completions/mean_terminated_length": 922.4166870117188, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.36363636363636365, "frac_reward_zero_std": 1.0, "grad_norm": 1.1904192120937296e-07, "kl": 0.0, "learning_rate": 3.2384403036576945e-07, "loss": 0.0, "num_tokens": 35518888.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1072 }, { "completion_length": 2646.58349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 5065.0, "completions/max_terminated_length": 5065.0, "completions/mean_length": 2646.58349609375, "completions/mean_terminated_length": 2646.58349609375, "completions/min_length": 1298.0, "completions/min_terminated_length": 1298.0, "epoch": 0.3639755766621438, "frac_reward_zero_std": 0.5, "grad_norm": 0.4912077486515045, "kl": 0.0, "learning_rate": 3.2367149758454105e-07, "loss": 0.0241, "num_tokens": 35567849.0, "reward": 0.7000000476837158, "reward_std": 0.20000001788139343, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4972652792930603, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1073 }, { "completion_length": 1210.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4130.0, "completions/max_terminated_length": 4130.0, "completions/mean_length": 1210.416748046875, "completions/mean_terminated_length": 1210.416748046875, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.364314789687924, "frac_reward_zero_std": 1.0, "grad_norm": 2.0099106734505767e-07, "kl": 0.0, "learning_rate": 3.234989648033126e-07, "loss": 0.0, "num_tokens": 35592238.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1074 }, { "completion_length": 1294.8333740234375, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6544.0, "completions/mean_length": 3491.166748046875, "completions/mean_terminated_length": 1942.25, "completions/min_length": 587.0, "completions/min_terminated_length": 587.0, "epoch": 0.3646540027137042, "frac_reward_zero_std": 0.5, "grad_norm": 0.7787041068077087, "kl": NaN, "learning_rate": 3.233264320220842e-07, "loss": -0.109, "num_tokens": 35618414.0, "reward": 0.8666666746139526, "reward_std": 0.3356585204601288, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.4923659861087799, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 1075 }, { "completion_length": 585.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 790.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 585.3333740234375, "completions/mean_terminated_length": 585.3333740234375, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 0.3649932157394844, "frac_reward_zero_std": 0.5, "grad_norm": 0.05329876020550728, "kl": 0.0, "learning_rate": 3.231538992408557e-07, "loss": 0.0003, "num_tokens": 35636064.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1076 }, { "completion_length": 1593.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2615.0, "completions/max_terminated_length": 2615.0, "completions/mean_length": 1593.75, "completions/mean_terminated_length": 1593.75, "completions/min_length": 780.0, "completions/min_terminated_length": 780.0, "epoch": 0.36533242876526456, "frac_reward_zero_std": 1.0, "grad_norm": 2.393840645709133e-07, "kl": 0.0, "learning_rate": 3.229813664596273e-07, "loss": 0.0, "num_tokens": 35672895.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1077 }, { "completion_length": 748.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1226.0, "completions/max_terminated_length": 1226.0, "completions/mean_length": 748.4166870117188, "completions/mean_terminated_length": 748.4166870117188, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.3656716417910448, "frac_reward_zero_std": 0.5, "grad_norm": 0.05215156450867653, "kl": 0.0, "learning_rate": 3.2280883367839887e-07, "loss": -0.0002, "num_tokens": 35693174.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1078 }, { "completion_length": 734.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1519.0, "completions/max_terminated_length": 1519.0, "completions/mean_length": 734.8333740234375, "completions/mean_terminated_length": 734.8333740234375, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.366010854816825, "frac_reward_zero_std": 0.0, "grad_norm": 0.33065640926361084, "kl": 0.0, "learning_rate": 3.226363008971704e-07, "loss": 0.0004, "num_tokens": 35716602.0, "reward": 1.1666667461395264, "reward_std": 0.2588964104652405, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.287096232175827, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1079 }, { "completion_length": 903.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2838.0, "completions/max_terminated_length": 2838.0, "completions/mean_length": 903.5833740234375, "completions/mean_terminated_length": 903.5833740234375, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.36635006784260515, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.2246376811594203e-07, "loss": 0.0, "num_tokens": 35742403.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1080 }, { "completion_length": 686.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 686.9166870117188, "completions/mean_terminated_length": 686.9166870117188, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 0.36668928086838537, "frac_reward_zero_std": 0.0, "grad_norm": 0.11860223859548569, "kl": 0.0, "learning_rate": 3.222912353347136e-07, "loss": 0.0001, "num_tokens": 35763444.0, "reward": 1.2000000476837158, "reward_std": 0.10327951610088348, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1081 }, { "completion_length": 1727.4166870117188, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4944.0, "completions/mean_length": 2825.58349609375, "completions/mean_terminated_length": 2072.900146484375, "completions/min_length": 612.0, "completions/min_terminated_length": 612.0, "epoch": 0.36702849389416553, "frac_reward_zero_std": 0.5, "grad_norm": 0.4601011574268341, "kl": NaN, "learning_rate": 3.221187025534852e-07, "loss": -0.0652, "num_tokens": 35795489.0, "reward": 0.5166666507720947, "reward_std": 0.2840188145637512, "rewards/correctness_reward_func/mean": 0.2666666805744171, "rewards/correctness_reward_func/std": 0.393892765045166, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 1082 }, { "completion_length": 1239.0833740234375, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4683.0, "completions/mean_length": 3435.416748046875, "completions/mean_terminated_length": 1858.625, "completions/min_length": 539.0, "completions/min_terminated_length": 539.0, "epoch": 0.36736770691994575, "frac_reward_zero_std": 0.0, "grad_norm": 0.2984149754047394, "kl": NaN, "learning_rate": 3.219461697722567e-07, "loss": -0.0243, "num_tokens": 35819514.0, "reward": 0.6833333373069763, "reward_std": 0.11828448623418808, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 1083 }, { "completion_length": 2202.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3487.0, "completions/max_terminated_length": 3487.0, "completions/mean_length": 2202.166748046875, "completions/mean_terminated_length": 2202.166748046875, "completions/min_length": 1477.0, "completions/min_terminated_length": 1477.0, "epoch": 0.3677069199457259, "frac_reward_zero_std": 0.0, "grad_norm": 0.5952599048614502, "kl": 0.0, "learning_rate": 3.217736369910283e-07, "loss": 0.0046, "num_tokens": 35861030.0, "reward": 0.9000000357627869, "reward_std": 0.3098386228084564, "rewards/correctness_reward_func/mean": 0.5999999642372131, "rewards/correctness_reward_func/std": 0.45126086473464966, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1084 }, { "completion_length": 1787.0000610351562, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5498.0, "completions/mean_length": 2336.08349609375, "completions/mean_terminated_length": 1949.45458984375, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 0.3680461329715061, "frac_reward_zero_std": 0.5, "grad_norm": 0.24942909181118011, "kl": NaN, "learning_rate": 3.2160110420979985e-07, "loss": -0.0408, "num_tokens": 35896934.0, "reward": 1.1083333492279053, "reward_std": 0.22453658282756805, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1085 }, { "completion_length": 2412.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5487.0, "completions/max_terminated_length": 5487.0, "completions/mean_length": 2412.83349609375, "completions/mean_terminated_length": 2412.83349609375, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "epoch": 0.3683853459972863, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.2142857142857145e-07, "loss": 0.0, "num_tokens": 35938458.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1086 }, { "completion_length": 1142.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 1522.0, "completions/max_terminated_length": 1522.0, "completions/mean_length": 1142.166748046875, "completions/mean_terminated_length": 1142.166748046875, "completions/min_length": 614.0, "completions/min_terminated_length": 614.0, "epoch": 0.3687245590230665, "frac_reward_zero_std": 0.0, "grad_norm": 0.3443276584148407, "kl": 0.0, "learning_rate": 3.2125603864734295e-07, "loss": 0.0016, "num_tokens": 35967794.0, "reward": 1.1166667938232422, "reward_std": 0.2408248484134674, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1087 }, { "completion_length": 1364.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3210.0, "completions/max_terminated_length": 3210.0, "completions/mean_length": 1364.5, "completions/mean_terminated_length": 1364.5, "completions/min_length": 648.0, "completions/min_terminated_length": 648.0, "epoch": 0.36906377204884666, "frac_reward_zero_std": 1.0, "grad_norm": 1.1987111747657764e-07, "kl": 0.0, "learning_rate": 3.2108350586611456e-07, "loss": 0.0, "num_tokens": 35995388.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1088 }, { "completion_length": 681.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1033.0, "completions/max_terminated_length": 1033.0, "completions/mean_length": 681.8333740234375, "completions/mean_terminated_length": 681.8333740234375, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 0.3694029850746269, "frac_reward_zero_std": 0.5, "grad_norm": 0.06151455268263817, "kl": 0.0, "learning_rate": 3.209109730848861e-07, "loss": 0.0001, "num_tokens": 36011646.0, "reward": 0.7666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1089 }, { "completion_length": 1568.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2973.0, "completions/max_terminated_length": 2973.0, "completions/mean_length": 1568.5833740234375, "completions/mean_terminated_length": 1568.5833740234375, "completions/min_length": 660.0, "completions/min_terminated_length": 660.0, "epoch": 0.36974219810040704, "frac_reward_zero_std": 0.5, "grad_norm": 0.07670875638723373, "kl": 0.0, "learning_rate": 3.2073844030365766e-07, "loss": -0.0024, "num_tokens": 36042193.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1090 }, { "completion_length": 2774.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5082.0, "completions/max_terminated_length": 5082.0, "completions/mean_length": 2774.916748046875, "completions/mean_terminated_length": 2774.916748046875, "completions/min_length": 1137.0, "completions/min_terminated_length": 1137.0, "epoch": 0.37008141112618725, "frac_reward_zero_std": 0.0, "grad_norm": 0.1478627622127533, "kl": 0.0, "learning_rate": 3.205659075224292e-07, "loss": 0.0016, "num_tokens": 36086142.0, "reward": 1.2000000476837158, "reward_std": 0.08164963126182556, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1091 }, { "completion_length": 1244.9167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2087.0, "completions/max_terminated_length": 2087.0, "completions/mean_length": 1244.916748046875, "completions/mean_terminated_length": 1244.916748046875, "completions/min_length": 543.0, "completions/min_terminated_length": 543.0, "epoch": 0.3704206241519674, "frac_reward_zero_std": 0.5, "grad_norm": 0.07772715389728546, "kl": 0.0, "learning_rate": 3.203933747412008e-07, "loss": -0.0008, "num_tokens": 36112439.0, "reward": 0.7666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1092 }, { "completion_length": 3700.1668701171875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5875.0, "completions/mean_length": 4249.25, "completions/mean_terminated_length": 4036.545654296875, "completions/min_length": 2466.0, "completions/min_terminated_length": 2466.0, "epoch": 0.37075983717774763, "frac_reward_zero_std": 0.0, "grad_norm": 0.5973675847053528, "kl": NaN, "learning_rate": 3.202208419599724e-07, "loss": -0.0139, "num_tokens": 36169963.0, "reward": 0.5583333373069763, "reward_std": 0.28409743309020996, "rewards/correctness_reward_func/mean": 0.28333333134651184, "rewards/correctness_reward_func/std": 0.4217568039894104, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 1093 }, { "completion_length": 1539.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4513.0, "completions/max_terminated_length": 4513.0, "completions/mean_length": 1539.25, "completions/mean_terminated_length": 1539.25, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.3710990502035278, "frac_reward_zero_std": 0.5, "grad_norm": 0.08394655585289001, "kl": 0.0, "learning_rate": 3.2004830917874393e-07, "loss": -0.0002, "num_tokens": 36198706.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.45000001788139343, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1094 }, { "completion_length": 832.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1562.0, "completions/max_terminated_length": 1562.0, "completions/mean_length": 832.0, "completions/mean_terminated_length": 832.0, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.371438263229308, "frac_reward_zero_std": 1.0, "grad_norm": 1.5648636519927095e-07, "kl": 0.0, "learning_rate": 3.198757763975155e-07, "loss": 0.0, "num_tokens": 36220414.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1095 }, { "completion_length": 1543.916748046875, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4848.0, "completions/mean_length": 2642.08349609375, "completions/mean_terminated_length": 1852.7000732421875, "completions/min_length": 525.0, "completions/min_terminated_length": 525.0, "epoch": 0.37177747625508817, "frac_reward_zero_std": 0.5, "grad_norm": 0.9393840432167053, "kl": NaN, "learning_rate": 3.197032436162871e-07, "loss": -0.0529, "num_tokens": 36248469.0, "reward": 0.9291667938232422, "reward_std": 0.26571446657180786, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 1096 }, { "completion_length": 2080.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6511.0, "completions/max_terminated_length": 6511.0, "completions/mean_length": 2080.916748046875, "completions/mean_terminated_length": 2080.916748046875, "completions/min_length": 639.0, "completions/min_terminated_length": 639.0, "epoch": 0.3721166892808684, "frac_reward_zero_std": 0.5, "grad_norm": 1.2343807220458984, "kl": 0.0, "learning_rate": 3.195307108350587e-07, "loss": -0.0573, "num_tokens": 36287540.0, "reward": 0.9666666388511658, "reward_std": 0.2581988573074341, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.4923659861087799, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1097 }, { "completion_length": 572.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 572.4166870117188, "completions/mean_terminated_length": 572.4166870117188, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 0.3724559023066486, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.193581780538302e-07, "loss": 0.0, "num_tokens": 36306535.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1098 }, { "completion_length": 1254.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2624.0, "completions/max_terminated_length": 2624.0, "completions/mean_length": 1254.0833740234375, "completions/mean_terminated_length": 1254.0833740234375, "completions/min_length": 524.0, "completions/min_terminated_length": 524.0, "epoch": 0.37279511533242876, "frac_reward_zero_std": 0.5, "grad_norm": 0.41037648916244507, "kl": 0.0, "learning_rate": 3.191856452726018e-07, "loss": -0.0013, "num_tokens": 36335042.0, "reward": 0.8666667342185974, "reward_std": 0.2581988573074341, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.42497774958610535, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1099 }, { "completion_length": 719.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 908.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 719.6666870117188, "completions/mean_terminated_length": 719.6666870117188, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 0.373134328358209, "frac_reward_zero_std": 0.5, "grad_norm": 0.08051758259534836, "kl": 0.0, "learning_rate": 3.1901311249137335e-07, "loss": -0.001, "num_tokens": 36360886.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1100 }, { "completion_length": 1819.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4713.0, "completions/max_terminated_length": 4713.0, "completions/mean_length": 1819.416748046875, "completions/mean_terminated_length": 1819.416748046875, "completions/min_length": 643.0, "completions/min_terminated_length": 643.0, "epoch": 0.37347354138398914, "frac_reward_zero_std": 0.5, "grad_norm": 0.08507302403450012, "kl": 0.0, "learning_rate": 3.188405797101449e-07, "loss": -0.0004, "num_tokens": 36397317.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1101 }, { "completion_length": 1631.3333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5713.0, "completions/mean_length": 2180.416748046875, "completions/mean_terminated_length": 1779.636474609375, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "epoch": 0.37381275440976935, "frac_reward_zero_std": 0.0, "grad_norm": 0.551823616027832, "kl": NaN, "learning_rate": 3.1866804692891646e-07, "loss": 0.0072, "num_tokens": 36427135.0, "reward": 0.958333432674408, "reward_std": 0.2906581163406372, "rewards/correctness_reward_func/mean": 0.6833333969116211, "rewards/correctness_reward_func/std": 0.32427075505256653, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1102 }, { "completion_length": 1906.666748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6370.0, "completions/mean_length": 3553.916748046875, "completions/mean_terminated_length": 2542.22216796875, "completions/min_length": 680.0, "completions/min_terminated_length": 680.0, "epoch": 0.3741519674355495, "frac_reward_zero_std": 0.5, "grad_norm": 0.1889905482530594, "kl": NaN, "learning_rate": 3.1849551414768806e-07, "loss": -0.031, "num_tokens": 36462495.0, "reward": 0.22500000894069672, "reward_std": 0.08215838670730591, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 1103 }, { "completion_length": 2281.0000610351562, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5542.0, "completions/mean_length": 3379.166748046875, "completions/mean_terminated_length": 2737.199951171875, "completions/min_length": 1081.0, "completions/min_terminated_length": 1081.0, "epoch": 0.37449118046132973, "frac_reward_zero_std": 0.5, "grad_norm": 0.2782169282436371, "kl": NaN, "learning_rate": 3.183229813664596e-07, "loss": -0.0208, "num_tokens": 36494913.0, "reward": 0.25, "reward_std": 0.0774596780538559, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 1104 }, { "completion_length": 2244.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3560.0, "completions/max_terminated_length": 3560.0, "completions/mean_length": 2244.416748046875, "completions/mean_terminated_length": 2244.416748046875, "completions/min_length": 1373.0, "completions/min_terminated_length": 1373.0, "epoch": 0.3748303934871099, "frac_reward_zero_std": 1.0, "grad_norm": 1.5637188255368528e-07, "kl": 0.0, "learning_rate": 3.1815044858523117e-07, "loss": 0.0, "num_tokens": 36534866.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1105 }, { "completion_length": 3340.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6082.0, "completions/max_terminated_length": 6082.0, "completions/mean_length": 3340.916748046875, "completions/mean_terminated_length": 3340.916748046875, "completions/min_length": 1797.0, "completions/min_terminated_length": 1797.0, "epoch": 0.3751696065128901, "frac_reward_zero_std": 0.5, "grad_norm": 0.7795282602310181, "kl": 0.0, "learning_rate": 3.179779158040027e-07, "loss": 0.0101, "num_tokens": 36590713.0, "reward": 0.6000000238418579, "reward_std": 0.2366432249546051, "rewards/correctness_reward_func/mean": 0.29999998211860657, "rewards/correctness_reward_func/std": 0.4472135901451111, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1106 }, { "completion_length": 1301.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1920.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 1301.916748046875, "completions/mean_terminated_length": 1301.916748046875, "completions/min_length": 637.0, "completions/min_terminated_length": 637.0, "epoch": 0.37550881953867027, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.178053830227743e-07, "loss": 0.0, "num_tokens": 36615180.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1107 }, { "completion_length": 2957.0833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5362.0, "completions/mean_length": 3506.166748046875, "completions/mean_terminated_length": 3225.9091796875, "completions/min_length": 1504.0, "completions/min_terminated_length": 1504.0, "epoch": 0.3758480325644505, "frac_reward_zero_std": 0.0, "grad_norm": 1.1118133068084717, "kl": NaN, "learning_rate": 3.176328502415459e-07, "loss": 0.0003, "num_tokens": 36663931.0, "reward": 0.5875000357627869, "reward_std": 0.267261803150177, "rewards/correctness_reward_func/mean": 0.29999998211860657, "rewards/correctness_reward_func/std": 0.4472135901451111, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1108 }, { "completion_length": 2553.166748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5375.0, "completions/mean_length": 3102.25, "completions/mean_terminated_length": 2785.272705078125, "completions/min_length": 770.0, "completions/min_terminated_length": 770.0, "epoch": 0.37618724559023065, "frac_reward_zero_std": 0.5, "grad_norm": 0.15863870084285736, "kl": NaN, "learning_rate": 3.1746031746031743e-07, "loss": 0.0309, "num_tokens": 36704739.0, "reward": 0.7541667819023132, "reward_std": 0.13268069922924042, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1109 }, { "completion_length": 475.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 734.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 475.0833435058594, "completions/mean_terminated_length": 475.0833435058594, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.37652645861601086, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.17287784679089e-07, "loss": 0.0, "num_tokens": 36725722.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1110 }, { "completion_length": 1494.3333740234375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5273.0, "completions/mean_length": 2592.5, "completions/mean_terminated_length": 1793.2000732421875, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "epoch": 0.376865671641791, "frac_reward_zero_std": 0.5, "grad_norm": 0.8331822156906128, "kl": NaN, "learning_rate": 3.171152518978606e-07, "loss": -0.0695, "num_tokens": 36753854.0, "reward": 0.533333420753479, "reward_std": 0.29944396018981934, "rewards/correctness_reward_func/mean": 0.28333333134651184, "rewards/correctness_reward_func/std": 0.4217568039894104, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 1111 }, { "completion_length": 2044.0833740234375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5376.0, "completions/mean_length": 3142.25, "completions/mean_terminated_length": 2452.900146484375, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "epoch": 0.37720488466757124, "frac_reward_zero_std": 0.5, "grad_norm": 0.1642346829175949, "kl": NaN, "learning_rate": 3.169427191166322e-07, "loss": -0.0204, "num_tokens": 36787923.0, "reward": 0.6500000953674316, "reward_std": 0.0774596780538559, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 1112 }, { "completion_length": 1897.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3919.0, "completions/max_terminated_length": 3919.0, "completions/mean_length": 1897.3333740234375, "completions/mean_terminated_length": 1897.3333740234375, "completions/min_length": 715.0, "completions/min_terminated_length": 715.0, "epoch": 0.3775440976933514, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.167701863354037e-07, "loss": 0.0, "num_tokens": 36819997.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1113 }, { "completion_length": 1237.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2895.0, "completions/max_terminated_length": 2895.0, "completions/mean_length": 1237.5, "completions/mean_terminated_length": 1237.5, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 0.3778833107191316, "frac_reward_zero_std": 0.5, "grad_norm": 0.06216409429907799, "kl": 0.0, "learning_rate": 3.165976535541753e-07, "loss": 0.0016, "num_tokens": 36845767.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1114 }, { "completion_length": 1420.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2140.0, "completions/max_terminated_length": 2140.0, "completions/mean_length": 1420.416748046875, "completions/mean_terminated_length": 1420.416748046875, "completions/min_length": 671.0, "completions/min_terminated_length": 671.0, "epoch": 0.37822252374491183, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.1642512077294685e-07, "loss": 0.0, "num_tokens": 36878856.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1115 }, { "completion_length": 1458.9166870117188, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5461.0, "completions/mean_length": 2008.0, "completions/mean_terminated_length": 1591.5455322265625, "completions/min_length": 679.0, "completions/min_terminated_length": 679.0, "epoch": 0.378561736770692, "frac_reward_zero_std": 0.5, "grad_norm": 0.09449256211519241, "kl": NaN, "learning_rate": 3.162525879917184e-07, "loss": -0.0079, "num_tokens": 36907043.0, "reward": 0.7749999761581421, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1116 }, { "completion_length": 1281.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2674.0, "completions/max_terminated_length": 2674.0, "completions/mean_length": 1281.0833740234375, "completions/mean_terminated_length": 1281.0833740234375, "completions/min_length": 648.0, "completions/min_terminated_length": 648.0, "epoch": 0.3789009497964722, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.1608005521048996e-07, "loss": 0.0, "num_tokens": 36936186.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1117 }, { "completion_length": 989.6667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 1424.0, "completions/max_terminated_length": 1424.0, "completions/mean_length": 989.6666870117188, "completions/mean_terminated_length": 989.6666870117188, "completions/min_length": 615.0, "completions/min_terminated_length": 615.0, "epoch": 0.37924016282225237, "frac_reward_zero_std": 0.5, "grad_norm": 0.05201732739806175, "kl": 0.0, "learning_rate": 3.1590752242926157e-07, "loss": -0.0008, "num_tokens": 36956900.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1118 }, { "completion_length": 1812.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4020.0, "completions/max_terminated_length": 4020.0, "completions/mean_length": 1812.3333740234375, "completions/mean_terminated_length": 1812.3333740234375, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 0.3795793758480326, "frac_reward_zero_std": 0.0, "grad_norm": 0.6742749810218811, "kl": 0.0, "learning_rate": 3.157349896480331e-07, "loss": -0.0038, "num_tokens": 36990696.0, "reward": 1.1000001430511475, "reward_std": 0.3098386526107788, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.38138505816459656, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1119 }, { "completion_length": 2011.25, "completions/clipped_ratio": 0.0, "completions/max_length": 6380.0, "completions/max_terminated_length": 6380.0, "completions/mean_length": 2011.25, "completions/mean_terminated_length": 2011.25, "completions/min_length": 1041.0, "completions/min_terminated_length": 1041.0, "epoch": 0.37991858887381275, "frac_reward_zero_std": 1.0, "grad_norm": 1.1849378012129819e-07, "kl": 0.0, "learning_rate": 3.1556245686680467e-07, "loss": 0.0, "num_tokens": 37031973.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1120 }, { "completion_length": 1081.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 6422.0, "completions/max_terminated_length": 6422.0, "completions/mean_length": 1081.8333740234375, "completions/mean_terminated_length": 1081.8333740234375, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 0.38025780189959296, "frac_reward_zero_std": 0.5, "grad_norm": 0.6227987408638, "kl": 0.0, "learning_rate": 3.153899240855762e-07, "loss": 0.0588, "num_tokens": 37058191.0, "reward": 1.2000000476837158, "reward_std": 0.20000001788139343, "rewards/correctness_reward_func/mean": 0.9000000357627869, "rewards/correctness_reward_func/std": 0.28919950127601624, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1121 }, { "completion_length": 716.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2709.0, "completions/max_terminated_length": 2709.0, "completions/mean_length": 716.4166870117188, "completions/mean_terminated_length": 716.4166870117188, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.3805970149253731, "frac_reward_zero_std": 0.5, "grad_norm": 0.06783906370401382, "kl": 0.0, "learning_rate": 3.1521739130434783e-07, "loss": -0.0001, "num_tokens": 37079796.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1122 }, { "completion_length": 1906.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3933.0, "completions/max_terminated_length": 3933.0, "completions/mean_length": 1906.5, "completions/mean_terminated_length": 1906.5, "completions/min_length": 870.0, "completions/min_terminated_length": 870.0, "epoch": 0.38093622795115334, "frac_reward_zero_std": 1.0, "grad_norm": 1.9644974713628471e-07, "kl": 0.0, "learning_rate": 3.150448585231194e-07, "loss": 0.0, "num_tokens": 37115316.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1123 }, { "completion_length": 1349.3333740234375, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6095.0, "completions/mean_length": 3545.666748046875, "completions/mean_terminated_length": 2024.0, "completions/min_length": 589.0, "completions/min_terminated_length": 589.0, "epoch": 0.3812754409769335, "frac_reward_zero_std": 0.5, "grad_norm": 0.14500510692596436, "kl": NaN, "learning_rate": 3.1487232574189094e-07, "loss": -0.0217, "num_tokens": 37139212.0, "reward": 0.7250000238418579, "reward_std": 0.08215838670730591, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 1124 }, { "completion_length": 1195.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2540.0, "completions/max_terminated_length": 2540.0, "completions/mean_length": 1195.5, "completions/mean_terminated_length": 1195.5, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.3816146540027137, "frac_reward_zero_std": 0.5, "grad_norm": 0.09429711103439331, "kl": 0.0, "learning_rate": 3.146997929606625e-07, "loss": -0.0, "num_tokens": 37165168.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1125 }, { "completion_length": 1405.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 5481.0, "completions/max_terminated_length": 5481.0, "completions/mean_length": 1405.916748046875, "completions/mean_terminated_length": 1405.916748046875, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.3819538670284939, "frac_reward_zero_std": 0.5, "grad_norm": 0.10915450751781464, "kl": 0.0, "learning_rate": 3.145272601794341e-07, "loss": 0.0075, "num_tokens": 37196019.0, "reward": 1.2333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1126 }, { "completion_length": 579.8333587646484, "completions/clipped_ratio": 0.5833333333333333, "completions/max_length": 6589.0, "completions/max_terminated_length": 1833.0, "completions/mean_length": 4423.4169921875, "completions/mean_terminated_length": 1391.5999755859375, "completions/min_length": 945.0, "completions/min_terminated_length": 945.0, "epoch": 0.3822930800542741, "frac_reward_zero_std": 0.5, "grad_norm": 0.19408124685287476, "kl": NaN, "learning_rate": 3.1435472739820565e-07, "loss": -0.0191, "num_tokens": 37214539.0, "reward": 0.5416666865348816, "reward_std": 0.26536139845848083, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.125, "rewards/format_reward_func/std": 0.15447859466075897, "step": 1127 }, { "completion_length": 1045.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1632.0, "completions/max_terminated_length": 1632.0, "completions/mean_length": 1045.8333740234375, "completions/mean_terminated_length": 1045.8333740234375, "completions/min_length": 581.0, "completions/min_terminated_length": 581.0, "epoch": 0.38263229308005425, "frac_reward_zero_std": 0.5, "grad_norm": 0.07425491511821747, "kl": 0.0, "learning_rate": 3.141821946169772e-07, "loss": -0.0003, "num_tokens": 37238945.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1128 }, { "completion_length": 1301.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4797.0, "completions/max_terminated_length": 4797.0, "completions/mean_length": 1301.916748046875, "completions/mean_terminated_length": 1301.916748046875, "completions/min_length": 629.0, "completions/min_terminated_length": 629.0, "epoch": 0.38297150610583447, "frac_reward_zero_std": 0.5, "grad_norm": 0.2052173614501953, "kl": 0.0, "learning_rate": 3.140096618357488e-07, "loss": -0.0061, "num_tokens": 37264384.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1129 }, { "completion_length": 1817.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3552.0, "completions/max_terminated_length": 3552.0, "completions/mean_length": 1817.5, "completions/mean_terminated_length": 1817.5, "completions/min_length": 647.0, "completions/min_terminated_length": 647.0, "epoch": 0.38331071913161463, "frac_reward_zero_std": 0.0, "grad_norm": 0.5269929766654968, "kl": 0.0, "learning_rate": 3.1383712905452036e-07, "loss": 0.0255, "num_tokens": 37299646.0, "reward": 1.0500000715255737, "reward_std": 0.29902371764183044, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.36306774616241455, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1130 }, { "completion_length": 455.25, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 455.25, "completions/mean_terminated_length": 455.25, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.38364993215739485, "frac_reward_zero_std": 0.5, "grad_norm": 0.051706839352846146, "kl": 0.0, "learning_rate": 3.136645962732919e-07, "loss": -0.0002, "num_tokens": 37315429.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1131 }, { "completion_length": 687.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 687.6666870117188, "completions/mean_terminated_length": 687.6666870117188, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.383989145183175, "frac_reward_zero_std": 1.0, "grad_norm": 2.443991604650364e-07, "kl": 0.0, "learning_rate": 3.1349206349206346e-07, "loss": 0.0, "num_tokens": 37334109.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1132 }, { "completion_length": 1926.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4221.0, "completions/max_terminated_length": 4221.0, "completions/mean_length": 1926.5, "completions/mean_terminated_length": 1926.5, "completions/min_length": 742.0, "completions/min_terminated_length": 742.0, "epoch": 0.3843283582089552, "frac_reward_zero_std": 0.5, "grad_norm": 0.10946495085954666, "kl": 0.0, "learning_rate": 3.1331953071083507e-07, "loss": 0.0009, "num_tokens": 37363215.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1133 }, { "completion_length": 1685.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3747.0, "completions/max_terminated_length": 3747.0, "completions/mean_length": 1685.666748046875, "completions/mean_terminated_length": 1685.666748046875, "completions/min_length": 847.0, "completions/min_terminated_length": 847.0, "epoch": 0.38466757123473544, "frac_reward_zero_std": 1.0, "grad_norm": 1.2391146242407558e-07, "kl": 0.0, "learning_rate": 3.131469979296066e-07, "loss": 0.0, "num_tokens": 37394957.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1134 }, { "completion_length": 497.91668701171875, "completions/clipped_ratio": 0.8333333333333334, "completions/max_length": 6589.0, "completions/max_terminated_length": 3039.0, "completions/mean_length": 5988.75, "completions/mean_terminated_length": 2987.5, "completions/min_length": 2936.0, "completions/min_terminated_length": 2936.0, "epoch": 0.3850067842605156, "frac_reward_zero_std": 0.5, "grad_norm": 0.5248692035675049, "kl": NaN, "learning_rate": 3.129744651483782e-07, "loss": -0.0535, "num_tokens": 37411210.0, "reward": 0.19583332538604736, "reward_std": 0.27586984634399414, "rewards/correctness_reward_func/mean": 0.13333334028720856, "rewards/correctness_reward_func/std": 0.3113996088504791, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.11894422769546509, "step": 1135 }, { "completion_length": 725.2500305175781, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 1536.0, "completions/mean_length": 1274.3333740234375, "completions/mean_terminated_length": 791.1818237304688, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.3853459972862958, "frac_reward_zero_std": 0.5, "grad_norm": 0.0961931124329567, "kl": NaN, "learning_rate": 3.1280193236714973e-07, "loss": -0.0031, "num_tokens": 37427827.0, "reward": 0.7749999761581421, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1136 }, { "completion_length": 727.8333435058594, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 1890.0, "completions/mean_length": 4022.33349609375, "completions/mean_terminated_length": 1455.666748046875, "completions/min_length": 932.0, "completions/min_terminated_length": 932.0, "epoch": 0.385685210312076, "frac_reward_zero_std": 0.5, "grad_norm": 0.42210739850997925, "kl": NaN, "learning_rate": 3.1262939958592133e-07, "loss": -0.0066, "num_tokens": 37447373.0, "reward": 0.5666667222976685, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 1137 }, { "completion_length": 3745.916748046875, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6374.0, "completions/mean_length": 4844.08349609375, "completions/mean_terminated_length": 4495.10009765625, "completions/min_length": 2374.0, "completions/min_terminated_length": 2374.0, "epoch": 0.3860244233378562, "frac_reward_zero_std": 0.0, "grad_norm": 0.8837997317314148, "kl": NaN, "learning_rate": 3.1245686680469283e-07, "loss": -0.0914, "num_tokens": 37503778.0, "reward": 0.6833333373069763, "reward_std": 0.5422176122665405, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 1138 }, { "completion_length": 847.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1306.0, "completions/max_terminated_length": 1306.0, "completions/mean_length": 847.3333740234375, "completions/mean_terminated_length": 847.3333740234375, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 0.38636363636363635, "frac_reward_zero_std": 0.5, "grad_norm": 0.0851774737238884, "kl": 0.0, "learning_rate": 3.1228433402346444e-07, "loss": 0.0, "num_tokens": 37524560.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1139 }, { "completion_length": 1713.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4090.0, "completions/max_terminated_length": 4090.0, "completions/mean_length": 1713.166748046875, "completions/mean_terminated_length": 1713.166748046875, "completions/min_length": 667.0, "completions/min_terminated_length": 667.0, "epoch": 0.38670284938941657, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.12111801242236e-07, "loss": 0.0, "num_tokens": 37558282.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1140 }, { "completion_length": 890.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2244.0, "completions/max_terminated_length": 2244.0, "completions/mean_length": 890.0833740234375, "completions/mean_terminated_length": 890.0833740234375, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.38704206241519673, "frac_reward_zero_std": 1.0, "grad_norm": 2.4239955109806033e-07, "kl": 0.0, "learning_rate": 3.119392684610076e-07, "loss": 0.0, "num_tokens": 37578677.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1141 }, { "completion_length": 1182.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1753.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 1182.75, "completions/mean_terminated_length": 1182.75, "completions/min_length": 569.0, "completions/min_terminated_length": 569.0, "epoch": 0.38738127544097695, "frac_reward_zero_std": 0.5, "grad_norm": 0.09648097306489944, "kl": 0.0, "learning_rate": 3.117667356797791e-07, "loss": -0.0001, "num_tokens": 37604372.0, "reward": 0.7666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1142 }, { "completion_length": 908.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2136.0, "completions/max_terminated_length": 2136.0, "completions/mean_length": 908.8333740234375, "completions/mean_terminated_length": 908.8333740234375, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.3877204884667571, "frac_reward_zero_std": 0.5, "grad_norm": 0.04885704442858696, "kl": 0.0, "learning_rate": 3.115942028985507e-07, "loss": -0.0002, "num_tokens": 37631754.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1143 }, { "completion_length": 2577.5834350585938, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5271.0, "completions/mean_length": 3126.666748046875, "completions/mean_terminated_length": 2811.9091796875, "completions/min_length": 1267.0, "completions/min_terminated_length": 1267.0, "epoch": 0.3880597014925373, "frac_reward_zero_std": 0.0, "grad_norm": 0.6010740995407104, "kl": NaN, "learning_rate": 3.114216701173223e-07, "loss": -0.0342, "num_tokens": 37674331.0, "reward": 1.0916666984558105, "reward_std": 0.34151846170425415, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.3857303261756897, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1144 }, { "completion_length": 947.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1607.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 947.0833740234375, "completions/mean_terminated_length": 947.0833740234375, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 0.3883989145183175, "frac_reward_zero_std": 1.0, "grad_norm": 1.0952752660386977e-07, "kl": 0.0, "learning_rate": 3.1124913733609386e-07, "loss": 0.0, "num_tokens": 37697258.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1145 }, { "completion_length": 939.8333740234375, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4103.0, "completions/mean_length": 3136.166748046875, "completions/mean_terminated_length": 1409.75, "completions/min_length": 520.0, "completions/min_terminated_length": 520.0, "epoch": 0.3887381275440977, "frac_reward_zero_std": 0.5, "grad_norm": 0.16772747039794922, "kl": NaN, "learning_rate": 3.110766045548654e-07, "loss": -0.0168, "num_tokens": 37721538.0, "reward": 0.6000000834465027, "reward_std": 0.0774596780538559, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 1146 }, { "completion_length": 1370.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2354.0, "completions/max_terminated_length": 2354.0, "completions/mean_length": 1370.5833740234375, "completions/mean_terminated_length": 1370.5833740234375, "completions/min_length": 839.0, "completions/min_terminated_length": 839.0, "epoch": 0.38907734056987786, "frac_reward_zero_std": 1.0, "grad_norm": 1.3077035987407726e-07, "kl": 0.0, "learning_rate": 3.1090407177363697e-07, "loss": 0.0, "num_tokens": 37754431.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1147 }, { "completion_length": 2758.75, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6259.0, "completions/mean_length": 5504.1669921875, "completions/mean_terminated_length": 4729.2861328125, "completions/min_length": 1880.0, "completions/min_terminated_length": 1880.0, "epoch": 0.3894165535956581, "frac_reward_zero_std": 0.0, "grad_norm": 1.2581778764724731, "kl": NaN, "learning_rate": 3.107315389924086e-07, "loss": -0.1544, "num_tokens": 37799992.0, "reward": 0.49166667461395264, "reward_std": 0.5691067576408386, "rewards/correctness_reward_func/mean": 0.3166666626930237, "rewards/correctness_reward_func/std": 0.4706539809703827, "rewards/format_reward_func/mean": 0.17499999701976776, "rewards/format_reward_func/std": 0.15447859466075897, "step": 1148 }, { "completion_length": 2105.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4608.0, "completions/max_terminated_length": 4608.0, "completions/mean_length": 2105.166748046875, "completions/mean_terminated_length": 2105.166748046875, "completions/min_length": 674.0, "completions/min_terminated_length": 674.0, "epoch": 0.38975576662143824, "frac_reward_zero_std": 0.5, "grad_norm": 0.5437031388282776, "kl": 0.0, "learning_rate": 3.105590062111801e-07, "loss": -0.0071, "num_tokens": 37837686.0, "reward": 0.9000000953674316, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.5999999642372131, "rewards/correctness_reward_func/std": 0.36181363463401794, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1149 }, { "completion_length": 1609.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5142.0, "completions/max_terminated_length": 5142.0, "completions/mean_length": 1609.8333740234375, "completions/mean_terminated_length": 1609.8333740234375, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.39009497964721845, "frac_reward_zero_std": 0.5, "grad_norm": 0.05529695004224777, "kl": 0.0, "learning_rate": 3.103864734299517e-07, "loss": -0.0002, "num_tokens": 37867546.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1150 }, { "completion_length": 1237.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3243.0, "completions/max_terminated_length": 3243.0, "completions/mean_length": 1237.166748046875, "completions/mean_terminated_length": 1237.166748046875, "completions/min_length": 565.0, "completions/min_terminated_length": 565.0, "epoch": 0.39043419267299867, "frac_reward_zero_std": 1.0, "grad_norm": 1.1227295715343644e-07, "kl": 0.0, "learning_rate": 3.1021394064872323e-07, "loss": 0.0, "num_tokens": 37888242.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1151 }, { "completion_length": 1254.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3672.0, "completions/max_terminated_length": 3672.0, "completions/mean_length": 1254.166748046875, "completions/mean_terminated_length": 1254.166748046875, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 0.39077340569877883, "frac_reward_zero_std": 1.0, "grad_norm": 2.1543176842442335e-07, "kl": 0.0, "learning_rate": 3.1004140786749484e-07, "loss": 0.0, "num_tokens": 37914998.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1152 }, { "completion_length": 2654.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5423.0, "completions/max_terminated_length": 5423.0, "completions/mean_length": 2654.0, "completions/mean_terminated_length": 2654.0, "completions/min_length": 593.0, "completions/min_terminated_length": 593.0, "epoch": 0.39111261872455905, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.0986887508626634e-07, "loss": 0.0, "num_tokens": 37955816.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1153 }, { "completion_length": 765.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1269.0, "completions/max_terminated_length": 1269.0, "completions/mean_length": 765.0833740234375, "completions/mean_terminated_length": 765.0833740234375, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.3914518317503392, "frac_reward_zero_std": 1.0, "grad_norm": 1.1394226362426707e-07, "kl": 0.0, "learning_rate": 3.0969634230503794e-07, "loss": 0.0, "num_tokens": 37979055.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1154 }, { "completion_length": 1868.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5260.0, "completions/max_terminated_length": 5260.0, "completions/mean_length": 1868.0833740234375, "completions/mean_terminated_length": 1868.0833740234375, "completions/min_length": 657.0, "completions/min_terminated_length": 657.0, "epoch": 0.3917910447761194, "frac_reward_zero_std": 0.5, "grad_norm": 0.4059426784515381, "kl": 0.0, "learning_rate": 3.095238095238095e-07, "loss": 0.0072, "num_tokens": 38011186.0, "reward": 0.36666667461395264, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.06666667014360428, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1155 }, { "completion_length": 852.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2148.0, "completions/max_terminated_length": 2148.0, "completions/mean_length": 852.1666870117188, "completions/mean_terminated_length": 852.1666870117188, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 0.3921302578018996, "frac_reward_zero_std": 1.0, "grad_norm": 7.792284861807275e-08, "kl": 0.0, "learning_rate": 3.093512767425811e-07, "loss": 0.0, "num_tokens": 38031264.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1156 }, { "completion_length": 3239.2501220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5728.0, "completions/mean_length": 3788.33349609375, "completions/mean_terminated_length": 3533.727294921875, "completions/min_length": 1293.0, "completions/min_terminated_length": 1293.0, "epoch": 0.3924694708276798, "frac_reward_zero_std": 0.0, "grad_norm": 0.7937850952148438, "kl": NaN, "learning_rate": 3.091787439613526e-07, "loss": -0.0418, "num_tokens": 38080491.0, "reward": 0.3583333194255829, "reward_std": 0.26536136865615845, "rewards/correctness_reward_func/mean": 0.0833333358168602, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1157 }, { "completion_length": 2118.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5848.0, "completions/max_terminated_length": 5848.0, "completions/mean_length": 2118.916748046875, "completions/mean_terminated_length": 2118.916748046875, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 0.39280868385345996, "frac_reward_zero_std": 0.0, "grad_norm": 0.88248211145401, "kl": 0.0, "learning_rate": 3.090062111801242e-07, "loss": 0.025, "num_tokens": 38115314.0, "reward": 1.066666841506958, "reward_std": 0.3801923394203186, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941801071167, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1158 }, { "completion_length": 3405.666748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6264.0, "completions/mean_length": 5052.9169921875, "completions/mean_terminated_length": 4540.88916015625, "completions/min_length": 2788.0, "completions/min_terminated_length": 2788.0, "epoch": 0.3931478968792402, "frac_reward_zero_std": 0.5, "grad_norm": 0.5897723436355591, "kl": NaN, "learning_rate": 3.088336783988958e-07, "loss": -0.0832, "num_tokens": 38168254.0, "reward": 0.42500004172325134, "reward_std": 0.30124741792678833, "rewards/correctness_reward_func/mean": 0.20000000298023224, "rewards/correctness_reward_func/std": 0.36181363463401794, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 1159 }, { "completion_length": 2761.916748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5114.0, "completions/mean_length": 3311.0, "completions/mean_terminated_length": 3013.0, "completions/min_length": 680.0, "completions/min_terminated_length": 680.0, "epoch": 0.39348710990502034, "frac_reward_zero_std": 0.5, "grad_norm": 0.36621418595314026, "kl": NaN, "learning_rate": 3.086611456176673e-07, "loss": -0.0413, "num_tokens": 38217081.0, "reward": 1.125, "reward_std": 0.23611438274383545, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444525599479675, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1160 }, { "completion_length": 1509.3333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 2844.0, "completions/mean_length": 2058.416748046875, "completions/mean_terminated_length": 1646.5455322265625, "completions/min_length": 650.0, "completions/min_terminated_length": 650.0, "epoch": 0.39382632293080055, "frac_reward_zero_std": 0.0, "grad_norm": 0.12216031551361084, "kl": NaN, "learning_rate": 3.084886128364389e-07, "loss": -0.008, "num_tokens": 38246293.0, "reward": 0.6916667819023132, "reward_std": 0.10206207633018494, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 1161 }, { "completion_length": 960.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2713.0, "completions/max_terminated_length": 2713.0, "completions/mean_length": 960.25, "completions/mean_terminated_length": 960.25, "completions/min_length": 527.0, "completions/min_terminated_length": 527.0, "epoch": 0.3941655359565807, "frac_reward_zero_std": 0.0, "grad_norm": 0.328845351934433, "kl": 0.0, "learning_rate": 3.0831608005521047e-07, "loss": -0.0032, "num_tokens": 38265982.0, "reward": 1.120833396911621, "reward_std": 0.27556759119033813, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1162 }, { "completion_length": 1225.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3911.0, "completions/max_terminated_length": 3911.0, "completions/mean_length": 1225.0833740234375, "completions/mean_terminated_length": 1225.0833740234375, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 0.39450474898236093, "frac_reward_zero_std": 0.0, "grad_norm": 0.15277113020420074, "kl": 0.0, "learning_rate": 3.081435472739821e-07, "loss": -0.006, "num_tokens": 38291057.0, "reward": 1.183333396911621, "reward_std": 0.10641199350357056, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1163 }, { "completion_length": 2266.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5092.0, "completions/max_terminated_length": 5092.0, "completions/mean_length": 2266.5, "completions/mean_terminated_length": 2266.5, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 0.3948439620081411, "frac_reward_zero_std": 0.5, "grad_norm": 0.3576086759567261, "kl": 0.0, "learning_rate": 3.079710144927536e-07, "loss": 0.0091, "num_tokens": 38327105.0, "reward": 0.36666667461395264, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.06666667014360428, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1164 }, { "completion_length": 1151.6666717529297, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5928.0, "completions/mean_length": 3348.0, "completions/mean_terminated_length": 1727.5, "completions/min_length": 572.0, "completions/min_terminated_length": 572.0, "epoch": 0.3951831750339213, "frac_reward_zero_std": 0.5, "grad_norm": 0.7642198801040649, "kl": NaN, "learning_rate": 3.077984817115252e-07, "loss": -0.0965, "num_tokens": 38353021.0, "reward": 0.8666666746139526, "reward_std": 0.3356585204601288, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.4923659861087799, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 1165 }, { "completion_length": 1397.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3868.0, "completions/max_terminated_length": 3868.0, "completions/mean_length": 1397.8333740234375, "completions/mean_terminated_length": 1397.8333740234375, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.39552238805970147, "frac_reward_zero_std": 0.5, "grad_norm": 0.10587406158447266, "kl": 0.0, "learning_rate": 3.0762594893029674e-07, "loss": 0.0004, "num_tokens": 38379989.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1166 }, { "completion_length": 1057.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2549.0, "completions/max_terminated_length": 2549.0, "completions/mean_length": 1057.3333740234375, "completions/mean_terminated_length": 1057.3333740234375, "completions/min_length": 643.0, "completions/min_terminated_length": 643.0, "epoch": 0.3958616010854817, "frac_reward_zero_std": 1.0, "grad_norm": 2.1673270111932652e-07, "kl": 0.0, "learning_rate": 3.0745341614906834e-07, "loss": 0.0, "num_tokens": 38405397.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1167 }, { "completion_length": 1041.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2178.0, "completions/max_terminated_length": 2178.0, "completions/mean_length": 1041.8333740234375, "completions/mean_terminated_length": 1041.8333740234375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.39620081411126185, "frac_reward_zero_std": 0.5, "grad_norm": 0.3525071144104004, "kl": 0.0, "learning_rate": 3.0728088336783984e-07, "loss": -0.0088, "num_tokens": 38428579.0, "reward": 1.2166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1168 }, { "completion_length": 1290.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2267.0, "completions/max_terminated_length": 2267.0, "completions/mean_length": 1290.5, "completions/mean_terminated_length": 1290.5, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 0.39654002713704206, "frac_reward_zero_std": 0.5, "grad_norm": 0.05163000151515007, "kl": 0.0, "learning_rate": 3.0710835058661145e-07, "loss": -0.0004, "num_tokens": 38454457.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1169 }, { "completion_length": 1991.1666870117188, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6390.0, "completions/mean_length": 3638.416748046875, "completions/mean_terminated_length": 2654.888916015625, "completions/min_length": 683.0, "completions/min_terminated_length": 683.0, "epoch": 0.3968792401628223, "frac_reward_zero_std": 0.0, "grad_norm": 0.7516310811042786, "kl": NaN, "learning_rate": 3.06935817805383e-07, "loss": -0.0879, "num_tokens": 38490039.0, "reward": 0.8083333969116211, "reward_std": 0.36240944266319275, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 1170 }, { "completion_length": 838.0833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 2290.0, "completions/mean_length": 1387.166748046875, "completions/mean_terminated_length": 914.2727661132812, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.39721845318860244, "frac_reward_zero_std": 0.0, "grad_norm": 0.23873606324195862, "kl": NaN, "learning_rate": 3.0676328502415455e-07, "loss": -0.017, "num_tokens": 38511586.0, "reward": 1.125, "reward_std": 0.23662227392196655, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444525599479675, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 1171 }, { "completion_length": 3354.7501220703125, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6475.0, "completions/mean_length": 5002.0, "completions/mean_terminated_length": 4473.0, "completions/min_length": 2455.0, "completions/min_terminated_length": 2455.0, "epoch": 0.39755766621438265, "frac_reward_zero_std": 0.0, "grad_norm": 0.25712013244628906, "kl": NaN, "learning_rate": 3.065907522429261e-07, "loss": -0.0311, "num_tokens": 38560819.0, "reward": 0.6750000715255737, "reward_std": 0.13693061470985413, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 1172 }, { "completion_length": 786.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1067.0, "completions/max_terminated_length": 1067.0, "completions/mean_length": 786.5, "completions/mean_terminated_length": 786.5, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 0.3978968792401628, "frac_reward_zero_std": 0.0, "grad_norm": 0.2868483364582062, "kl": 0.0, "learning_rate": 3.064182194616977e-07, "loss": 0.001, "num_tokens": 38581441.0, "reward": 0.8000000715255737, "reward_std": 0.21493908762931824, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.4472135901451111, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1173 }, { "completion_length": 1905.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3776.0, "completions/max_terminated_length": 3776.0, "completions/mean_length": 1905.8333740234375, "completions/mean_terminated_length": 1905.8333740234375, "completions/min_length": 943.0, "completions/min_terminated_length": 943.0, "epoch": 0.39823609226594303, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.062456866804693e-07, "loss": 0.0, "num_tokens": 38615231.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1174 }, { "completion_length": 942.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2344.0, "completions/max_terminated_length": 2344.0, "completions/mean_length": 942.0833740234375, "completions/mean_terminated_length": 942.0833740234375, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.3985753052917232, "frac_reward_zero_std": 0.5, "grad_norm": 0.39738231897354126, "kl": 0.0, "learning_rate": 3.060731538992408e-07, "loss": -0.0127, "num_tokens": 38637870.0, "reward": 1.1000001430511475, "reward_std": 0.23664319515228271, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.38138505816459656, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1175 }, { "completion_length": 3275.33349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 6384.0, "completions/max_terminated_length": 6384.0, "completions/mean_length": 3275.33349609375, "completions/mean_terminated_length": 3275.33349609375, "completions/min_length": 1169.0, "completions/min_terminated_length": 1169.0, "epoch": 0.3989145183175034, "frac_reward_zero_std": 0.0, "grad_norm": 1.0210018157958984, "kl": 0.0, "learning_rate": 3.059006211180124e-07, "loss": 0.0784, "num_tokens": 38690500.0, "reward": 0.6333333253860474, "reward_std": 0.36985844373703003, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1176 }, { "completion_length": 1131.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2485.0, "completions/max_terminated_length": 2485.0, "completions/mean_length": 1131.0833740234375, "completions/mean_terminated_length": 1131.0833740234375, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 0.39925373134328357, "frac_reward_zero_std": 0.5, "grad_norm": 0.1369536817073822, "kl": 0.0, "learning_rate": 3.05728088336784e-07, "loss": 0.0002, "num_tokens": 38714027.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1177 }, { "completion_length": 1086.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1812.0, "completions/max_terminated_length": 1812.0, "completions/mean_length": 1086.666748046875, "completions/mean_terminated_length": 1086.666748046875, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 0.3995929443690638, "frac_reward_zero_std": 1.0, "grad_norm": 1.0610715150960459e-07, "kl": 0.0, "learning_rate": 3.055555555555556e-07, "loss": 0.0, "num_tokens": 38737201.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1178 }, { "completion_length": 330.33333587646484, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 3624.83349609375, "completions/mean_terminated_length": 660.6666870117188, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 0.39993215739484395, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": NaN, "learning_rate": 3.053830227743271e-07, "loss": 0.0, "num_tokens": 38758193.0, "reward": 0.15000000596046448, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 1179 }, { "completion_length": 922.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1362.0, "completions/max_terminated_length": 1362.0, "completions/mean_length": 922.8333740234375, "completions/mean_terminated_length": 922.8333740234375, "completions/min_length": 710.0, "completions/min_terminated_length": 710.0, "epoch": 0.40027137042062416, "frac_reward_zero_std": 1.0, "grad_norm": 1.57872079853405e-07, "kl": 0.0, "learning_rate": 3.052104899930987e-07, "loss": 0.0, "num_tokens": 38778675.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1180 }, { "completion_length": 1437.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3430.0, "completions/max_terminated_length": 3430.0, "completions/mean_length": 1437.3333740234375, "completions/mean_terminated_length": 1437.3333740234375, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 0.4006105834464043, "frac_reward_zero_std": 1.0, "grad_norm": 1.2580862573940976e-07, "kl": 0.0, "learning_rate": 3.0503795721187024e-07, "loss": 0.0, "num_tokens": 38810479.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1181 }, { "completion_length": 981.9167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 1712.0, "completions/max_terminated_length": 1712.0, "completions/mean_length": 981.9166870117188, "completions/mean_terminated_length": 981.9166870117188, "completions/min_length": 606.0, "completions/min_terminated_length": 606.0, "epoch": 0.40094979647218454, "frac_reward_zero_std": 0.5, "grad_norm": 0.07529390603303909, "kl": 0.0, "learning_rate": 3.048654244306418e-07, "loss": -0.0003, "num_tokens": 38835294.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1182 }, { "completion_length": 2883.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5593.0, "completions/max_terminated_length": 5593.0, "completions/mean_length": 2883.0, "completions/mean_terminated_length": 2883.0, "completions/min_length": 837.0, "completions/min_terminated_length": 837.0, "epoch": 0.4012890094979647, "frac_reward_zero_std": 0.5, "grad_norm": 0.0907670110464096, "kl": 0.0, "learning_rate": 3.0469289164941335e-07, "loss": -0.0009, "num_tokens": 38882040.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1183 }, { "completion_length": 943.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1486.0, "completions/max_terminated_length": 1486.0, "completions/mean_length": 943.25, "completions/mean_terminated_length": 943.25, "completions/min_length": 646.0, "completions/min_terminated_length": 646.0, "epoch": 0.4016282225237449, "frac_reward_zero_std": 1.0, "grad_norm": 1.4570483131137735e-07, "kl": 0.0, "learning_rate": 3.0452035886818495e-07, "loss": 0.0, "num_tokens": 38904177.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1184 }, { "completion_length": 2253.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4281.0, "completions/max_terminated_length": 4281.0, "completions/mean_length": 2253.666748046875, "completions/mean_terminated_length": 2253.666748046875, "completions/min_length": 1279.0, "completions/min_terminated_length": 1279.0, "epoch": 0.4019674355495251, "frac_reward_zero_std": 0.5, "grad_norm": 0.6932840943336487, "kl": 0.0, "learning_rate": 3.043478260869565e-07, "loss": 0.0198, "num_tokens": 38943323.0, "reward": 1.149999976158142, "reward_std": 0.17606817185878754, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444522619247437, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1185 }, { "completion_length": 1125.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2721.0, "completions/max_terminated_length": 2721.0, "completions/mean_length": 1125.25, "completions/mean_terminated_length": 1125.25, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 0.4023066485753053, "frac_reward_zero_std": 0.5, "grad_norm": 0.5989533066749573, "kl": 0.0, "learning_rate": 3.0417529330572806e-07, "loss": 0.0174, "num_tokens": 38966444.0, "reward": 1.1166667938232422, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1186 }, { "completion_length": 1602.4166870117188, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4904.0, "completions/mean_length": 2700.58349609375, "completions/mean_terminated_length": 1922.9000244140625, "completions/min_length": 542.0, "completions/min_terminated_length": 542.0, "epoch": 0.4026458616010855, "frac_reward_zero_std": 0.5, "grad_norm": 0.5411194562911987, "kl": NaN, "learning_rate": 3.040027605244996e-07, "loss": -0.0434, "num_tokens": 39001783.0, "reward": 0.4791666865348816, "reward_std": 0.28478795289993286, "rewards/correctness_reward_func/mean": 0.21666665375232697, "rewards/correctness_reward_func/std": 0.39504510164260864, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 1187 }, { "completion_length": 1195.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3239.0, "completions/max_terminated_length": 3239.0, "completions/mean_length": 1195.5, "completions/mean_terminated_length": 1195.5, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 0.40298507462686567, "frac_reward_zero_std": 1.0, "grad_norm": 1.5577973044855753e-07, "kl": 0.0, "learning_rate": 3.038302277432712e-07, "loss": 0.0, "num_tokens": 39027937.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1188 }, { "completion_length": 1130.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2683.0, "completions/max_terminated_length": 2683.0, "completions/mean_length": 1130.166748046875, "completions/mean_terminated_length": 1130.166748046875, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 0.4033242876526459, "frac_reward_zero_std": 1.0, "grad_norm": 2.054463550393848e-07, "kl": 0.0, "learning_rate": 3.036576949620428e-07, "loss": 0.0, "num_tokens": 39055389.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1189 }, { "completion_length": 1322.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1990.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 1322.75, "completions/mean_terminated_length": 1322.75, "completions/min_length": 775.0, "completions/min_terminated_length": 775.0, "epoch": 0.40366350067842605, "frac_reward_zero_std": 1.0, "grad_norm": 1.6975634764548886e-07, "kl": 0.0, "learning_rate": 3.034851621808143e-07, "loss": 0.0, "num_tokens": 39086874.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1190 }, { "completion_length": 712.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1520.0, "completions/max_terminated_length": 1520.0, "completions/mean_length": 712.4166870117188, "completions/mean_terminated_length": 712.4166870117188, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.40400271370420626, "frac_reward_zero_std": 1.0, "grad_norm": 1.4425592098632478e-07, "kl": 0.0, "learning_rate": 3.0331262939958593e-07, "loss": 0.0, "num_tokens": 39105311.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1191 }, { "completion_length": 1085.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2293.0, "completions/max_terminated_length": 2293.0, "completions/mean_length": 1085.166748046875, "completions/mean_terminated_length": 1085.166748046875, "completions/min_length": 491.0, "completions/min_terminated_length": 491.0, "epoch": 0.4043419267299864, "frac_reward_zero_std": 0.5, "grad_norm": 0.04221909120678902, "kl": 0.0, "learning_rate": 3.031400966183575e-07, "loss": 0.0, "num_tokens": 39135127.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1192 }, { "completion_length": 2253.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 3305.0, "completions/max_terminated_length": 3305.0, "completions/mean_length": 2253.5, "completions/mean_terminated_length": 2253.5, "completions/min_length": 1215.0, "completions/min_terminated_length": 1215.0, "epoch": 0.40468113975576664, "frac_reward_zero_std": 1.0, "grad_norm": 2.9392086275947804e-07, "kl": 0.0, "learning_rate": 3.0296756383712903e-07, "loss": 0.0, "num_tokens": 39173167.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1193 }, { "completion_length": 2800.5001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5929.0, "completions/mean_length": 3349.58349609375, "completions/mean_terminated_length": 3055.091064453125, "completions/min_length": 838.0, "completions/min_terminated_length": 838.0, "epoch": 0.4050203527815468, "frac_reward_zero_std": 0.5, "grad_norm": 0.0693206861615181, "kl": NaN, "learning_rate": 3.027950310559006e-07, "loss": -0.0137, "num_tokens": 39217873.0, "reward": 0.6750000715255737, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1194 }, { "completion_length": 1177.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2041.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1177.8333740234375, "completions/mean_terminated_length": 1177.8333740234375, "completions/min_length": 638.0, "completions/min_terminated_length": 638.0, "epoch": 0.405359565807327, "frac_reward_zero_std": 0.5, "grad_norm": 0.31252235174179077, "kl": 0.0, "learning_rate": 3.026224982746722e-07, "loss": -0.0049, "num_tokens": 39243725.0, "reward": 1.1500000953674316, "reward_std": 0.17606817185878754, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444525599479675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1195 }, { "completion_length": 1379.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2254.0, "completions/max_terminated_length": 2254.0, "completions/mean_length": 1379.666748046875, "completions/mean_terminated_length": 1379.666748046875, "completions/min_length": 840.0, "completions/min_terminated_length": 840.0, "epoch": 0.4056987788331072, "frac_reward_zero_std": 1.0, "grad_norm": 1.9561278463697818e-07, "kl": 0.0, "learning_rate": 3.0244996549344375e-07, "loss": 0.0, "num_tokens": 39268717.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1196 }, { "completion_length": 1176.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1502.0, "completions/max_terminated_length": 1502.0, "completions/mean_length": 1176.416748046875, "completions/mean_terminated_length": 1176.416748046875, "completions/min_length": 834.0, "completions/min_terminated_length": 834.0, "epoch": 0.4060379918588874, "frac_reward_zero_std": 0.5, "grad_norm": 0.06384456157684326, "kl": 0.0, "learning_rate": 3.022774327122153e-07, "loss": -0.0001, "num_tokens": 39292794.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1197 }, { "completion_length": 2047.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3967.0, "completions/max_terminated_length": 3967.0, "completions/mean_length": 2047.3333740234375, "completions/mean_terminated_length": 2047.3333740234375, "completions/min_length": 848.0, "completions/min_terminated_length": 848.0, "epoch": 0.40637720488466755, "frac_reward_zero_std": 1.0, "grad_norm": 3.8070660934863554e-07, "kl": 0.0, "learning_rate": 3.0210489993098685e-07, "loss": 0.0, "num_tokens": 39332224.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1198 }, { "completion_length": 1152.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1865.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 1152.8333740234375, "completions/mean_terminated_length": 1152.8333740234375, "completions/min_length": 648.0, "completions/min_terminated_length": 648.0, "epoch": 0.40671641791044777, "frac_reward_zero_std": 0.5, "grad_norm": 0.047211602330207825, "kl": 0.0, "learning_rate": 3.0193236714975846e-07, "loss": -0.0009, "num_tokens": 39358868.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1199 }, { "completion_length": 1687.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4138.0, "completions/max_terminated_length": 4138.0, "completions/mean_length": 1687.8333740234375, "completions/mean_terminated_length": 1687.8333740234375, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 0.40705563093622793, "frac_reward_zero_std": 0.5, "grad_norm": 0.14600740373134613, "kl": 0.0, "learning_rate": 3.0175983436853e-07, "loss": -0.0035, "num_tokens": 39388488.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1200 }, { "completion_length": 975.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1615.0, "completions/max_terminated_length": 1615.0, "completions/mean_length": 975.5, "completions/mean_terminated_length": 975.5, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.40739484396200815, "frac_reward_zero_std": 0.5, "grad_norm": 0.0806216225028038, "kl": 0.0, "learning_rate": 3.0158730158730156e-07, "loss": -0.0001, "num_tokens": 39407274.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1201 }, { "completion_length": 746.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1511.0, "completions/max_terminated_length": 1511.0, "completions/mean_length": 746.75, "completions/mean_terminated_length": 746.75, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 0.4077340569877883, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.014147688060731e-07, "loss": 0.0, "num_tokens": 39430701.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1202 }, { "completion_length": 2925.83349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 5142.0, "completions/max_terminated_length": 5142.0, "completions/mean_length": 2925.83349609375, "completions/mean_terminated_length": 2925.83349609375, "completions/min_length": 1220.0, "completions/min_terminated_length": 1220.0, "epoch": 0.4080732700135685, "frac_reward_zero_std": 0.5, "grad_norm": 0.1083245575428009, "kl": 0.0, "learning_rate": 3.012422360248447e-07, "loss": -0.0002, "num_tokens": 39480805.0, "reward": 0.7666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1203 }, { "completion_length": 962.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1842.0, "completions/max_terminated_length": 1842.0, "completions/mean_length": 962.4166870117188, "completions/mean_terminated_length": 962.4166870117188, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.4084124830393487, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.010697032436162e-07, "loss": 0.0, "num_tokens": 39506628.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1204 }, { "completion_length": 1508.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3435.0, "completions/max_terminated_length": 3435.0, "completions/mean_length": 1508.5833740234375, "completions/mean_terminated_length": 1508.5833740234375, "completions/min_length": 565.0, "completions/min_terminated_length": 565.0, "epoch": 0.4087516960651289, "frac_reward_zero_std": 0.5, "grad_norm": 0.1322796791791916, "kl": 0.0, "learning_rate": 3.0089717046238783e-07, "loss": 0.0001, "num_tokens": 39537241.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1205 }, { "completion_length": 1141.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2118.0, "completions/max_terminated_length": 2118.0, "completions/mean_length": 1141.416748046875, "completions/mean_terminated_length": 1141.416748046875, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 0.4090909090909091, "frac_reward_zero_std": 0.5, "grad_norm": 0.07480908930301666, "kl": 0.0, "learning_rate": 3.0072463768115943e-07, "loss": 0.0002, "num_tokens": 39565242.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1206 }, { "completion_length": 1428.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2778.0, "completions/max_terminated_length": 2778.0, "completions/mean_length": 1428.5, "completions/mean_terminated_length": 1428.5, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 0.4094301221166893, "frac_reward_zero_std": 0.0, "grad_norm": 0.4284980893135071, "kl": 0.0, "learning_rate": 3.00552104899931e-07, "loss": 0.0087, "num_tokens": 39595572.0, "reward": 1.1666667461395264, "reward_std": 0.23490381240844727, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.287096232175827, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1207 }, { "completion_length": 1202.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3776.0, "completions/max_terminated_length": 3776.0, "completions/mean_length": 1202.5, "completions/mean_terminated_length": 1202.5, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 0.4097693351424695, "frac_reward_zero_std": 0.5, "grad_norm": 0.09367663413286209, "kl": 0.0, "learning_rate": 3.0037957211870254e-07, "loss": 0.0015, "num_tokens": 39622932.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1208 }, { "completion_length": 1222.5833740234375, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 4289.0, "completions/mean_length": 2869.83349609375, "completions/mean_terminated_length": 1630.111083984375, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 0.41010854816824965, "frac_reward_zero_std": 0.5, "grad_norm": 0.8850648403167725, "kl": NaN, "learning_rate": 3.002070393374741e-07, "loss": -0.0608, "num_tokens": 39650719.0, "reward": 0.8583333492279053, "reward_std": 0.2709551453590393, "rewards/correctness_reward_func/mean": 0.6333333253860474, "rewards/correctness_reward_func/std": 0.4735424220561981, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 1209 }, { "completion_length": 2711.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5394.0, "completions/max_terminated_length": 5394.0, "completions/mean_length": 2711.166748046875, "completions/mean_terminated_length": 2711.166748046875, "completions/min_length": 1801.0, "completions/min_terminated_length": 1801.0, "epoch": 0.41044776119402987, "frac_reward_zero_std": 0.5, "grad_norm": 1.1114462614059448, "kl": 0.0, "learning_rate": 3.000345065562457e-07, "loss": -0.0037, "num_tokens": 39692751.0, "reward": 0.5375000238418579, "reward_std": 0.28885549306869507, "rewards/correctness_reward_func/mean": 0.25, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1210 }, { "completion_length": 2506.0000610351562, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6244.0, "completions/mean_length": 3055.08349609375, "completions/mean_terminated_length": 2733.818359375, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.41078697421981003, "frac_reward_zero_std": 0.5, "grad_norm": 0.7009245157241821, "kl": NaN, "learning_rate": 2.9986197377501725e-07, "loss": -0.0598, "num_tokens": 39736731.0, "reward": 1.0541666746139526, "reward_std": 0.22716552019119263, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941801071167, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1211 }, { "completion_length": 893.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1385.0, "completions/max_terminated_length": 1385.0, "completions/mean_length": 893.4166870117188, "completions/mean_terminated_length": 893.4166870117188, "completions/min_length": 568.0, "completions/min_terminated_length": 568.0, "epoch": 0.41112618724559025, "frac_reward_zero_std": 1.0, "grad_norm": 1.6236624844623293e-07, "kl": 0.0, "learning_rate": 2.996894409937888e-07, "loss": 0.0, "num_tokens": 39757820.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1212 }, { "completion_length": 1348.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2504.0, "completions/max_terminated_length": 2504.0, "completions/mean_length": 1348.25, "completions/mean_terminated_length": 1348.25, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.4114654002713704, "frac_reward_zero_std": 0.0, "grad_norm": 0.4130673408508301, "kl": 0.0, "learning_rate": 2.9951690821256036e-07, "loss": 0.0066, "num_tokens": 39792011.0, "reward": 1.0333333015441895, "reward_std": 0.2581988573074341, "rewards/correctness_reward_func/mean": 0.7333332896232605, "rewards/correctness_reward_func/std": 0.35505014657974243, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1213 }, { "completion_length": 1403.5000610351562, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3200.0, "completions/mean_length": 1952.5833740234375, "completions/mean_terminated_length": 1531.0909423828125, "completions/min_length": 696.0, "completions/min_terminated_length": 696.0, "epoch": 0.4118046132971506, "frac_reward_zero_std": 0.5, "grad_norm": 0.07185070961713791, "kl": NaN, "learning_rate": 2.9934437543133196e-07, "loss": -0.0033, "num_tokens": 39823289.0, "reward": 0.6875001788139343, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1214 }, { "completion_length": 2936.916748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4978.0, "completions/mean_length": 3486.0, "completions/mean_terminated_length": 3203.9091796875, "completions/min_length": 1905.0, "completions/min_terminated_length": 1905.0, "epoch": 0.4121438263229308, "frac_reward_zero_std": 0.0, "grad_norm": 0.5032631158828735, "kl": NaN, "learning_rate": 2.9917184265010346e-07, "loss": -0.0461, "num_tokens": 39871576.0, "reward": 0.7583333849906921, "reward_std": 0.42866072058677673, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 1215 }, { "completion_length": 952.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1419.0, "completions/max_terminated_length": 1419.0, "completions/mean_length": 952.9166870117188, "completions/mean_terminated_length": 952.9166870117188, "completions/min_length": 607.0, "completions/min_terminated_length": 607.0, "epoch": 0.412483039348711, "frac_reward_zero_std": 1.0, "grad_norm": 1.2045603625665535e-07, "kl": 0.0, "learning_rate": 2.9899930986887507e-07, "loss": 0.0, "num_tokens": 39893751.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1216 }, { "completion_length": 1386.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2366.0, "completions/max_terminated_length": 2366.0, "completions/mean_length": 1386.916748046875, "completions/mean_terminated_length": 1386.916748046875, "completions/min_length": 549.0, "completions/min_terminated_length": 549.0, "epoch": 0.41282225237449116, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.988267770876466e-07, "loss": 0.0, "num_tokens": 39922052.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1217 }, { "completion_length": 980.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1431.0, "completions/max_terminated_length": 1431.0, "completions/mean_length": 980.8333740234375, "completions/mean_terminated_length": 980.8333740234375, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 0.4131614654002714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.986542443064182e-07, "loss": 0.0, "num_tokens": 39951876.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1218 }, { "completion_length": 2743.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 3330.0, "completions/max_terminated_length": 3330.0, "completions/mean_length": 2743.0, "completions/mean_terminated_length": 2743.0, "completions/min_length": 1922.0, "completions/min_terminated_length": 1922.0, "epoch": 0.41350067842605154, "frac_reward_zero_std": 0.5, "grad_norm": 0.11917182058095932, "kl": 0.0, "learning_rate": 2.984817115251897e-07, "loss": -0.0006, "num_tokens": 39998160.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1219 }, { "completion_length": 1473.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2102.0, "completions/max_terminated_length": 2102.0, "completions/mean_length": 1473.5, "completions/mean_terminated_length": 1473.5, "completions/min_length": 522.0, "completions/min_terminated_length": 522.0, "epoch": 0.41383989145183175, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.9830917874396133e-07, "loss": 0.0, "num_tokens": 40027518.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1220 }, { "completion_length": 748.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1322.0, "completions/max_terminated_length": 1322.0, "completions/mean_length": 748.0833740234375, "completions/mean_terminated_length": 748.0833740234375, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.4141791044776119, "frac_reward_zero_std": 0.5, "grad_norm": 0.06546785682439804, "kl": 0.0, "learning_rate": 2.9813664596273294e-07, "loss": 0.0002, "num_tokens": 40047907.0, "reward": 0.7666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1221 }, { "completion_length": 1027.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1565.0, "completions/max_terminated_length": 1565.0, "completions/mean_length": 1027.5833740234375, "completions/mean_terminated_length": 1027.5833740234375, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 0.41451831750339213, "frac_reward_zero_std": 1.0, "grad_norm": 8.61922941908233e-08, "kl": 0.0, "learning_rate": 2.979641131815045e-07, "loss": 0.0, "num_tokens": 40074344.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1222 }, { "completion_length": 1322.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2822.0, "completions/max_terminated_length": 2822.0, "completions/mean_length": 1322.3333740234375, "completions/mean_terminated_length": 1322.3333740234375, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "epoch": 0.41485753052917235, "frac_reward_zero_std": 1.0, "grad_norm": 1.0692743046547548e-07, "kl": 0.0, "learning_rate": 2.9779158040027604e-07, "loss": 0.0, "num_tokens": 40100010.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1223 }, { "completion_length": 2486.3333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3456.0, "completions/mean_length": 3035.416748046875, "completions/mean_terminated_length": 2712.36376953125, "completions/min_length": 1895.0, "completions/min_terminated_length": 1895.0, "epoch": 0.4151967435549525, "frac_reward_zero_std": 0.0, "grad_norm": 0.6539344191551208, "kl": NaN, "learning_rate": 2.976190476190476e-07, "loss": -0.0396, "num_tokens": 40145416.0, "reward": 0.8541666865348816, "reward_std": 0.46926355361938477, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1224 }, { "completion_length": 1018.3333740234375, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 4399.0, "completions/mean_length": 4312.83349609375, "completions/mean_terminated_length": 2036.666748046875, "completions/min_length": 748.0, "completions/min_terminated_length": 748.0, "epoch": 0.4155359565807327, "frac_reward_zero_std": 1.0, "grad_norm": 2.4613501636849833e-07, "kl": NaN, "learning_rate": 2.974465148378192e-07, "loss": 0.0, "num_tokens": 40173776.0, "reward": 0.5500000715255737, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 1225 }, { "completion_length": 2122.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4899.0, "completions/max_terminated_length": 4899.0, "completions/mean_length": 2122.916748046875, "completions/mean_terminated_length": 2122.916748046875, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 0.4158751696065129, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.9727398205659075e-07, "loss": 0.0, "num_tokens": 40213393.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1226 }, { "completion_length": 1013.6666870117188, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 2229.0, "completions/mean_length": 1562.75, "completions/mean_terminated_length": 1105.8182373046875, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 0.4162143826322931, "frac_reward_zero_std": 0.5, "grad_norm": 0.31930023431777954, "kl": NaN, "learning_rate": 2.971014492753623e-07, "loss": 0.0047, "num_tokens": 40238853.0, "reward": 1.0333335399627686, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1227 }, { "completion_length": 1718.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3493.0, "completions/max_terminated_length": 3493.0, "completions/mean_length": 1718.416748046875, "completions/mean_terminated_length": 1718.416748046875, "completions/min_length": 670.0, "completions/min_terminated_length": 670.0, "epoch": 0.41655359565807326, "frac_reward_zero_std": 1.0, "grad_norm": 2.0217073881667602e-07, "kl": 0.0, "learning_rate": 2.9692891649413386e-07, "loss": 0.0, "num_tokens": 40269764.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1228 }, { "completion_length": 1092.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2205.0, "completions/max_terminated_length": 2205.0, "completions/mean_length": 1092.666748046875, "completions/mean_terminated_length": 1092.666748046875, "completions/min_length": 550.0, "completions/min_terminated_length": 550.0, "epoch": 0.4168928086838535, "frac_reward_zero_std": 0.5, "grad_norm": 0.05543777719140053, "kl": 0.0, "learning_rate": 2.9675638371290547e-07, "loss": -0.0004, "num_tokens": 40293646.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1229 }, { "completion_length": 638.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1027.0, "completions/max_terminated_length": 1027.0, "completions/mean_length": 638.1666870117188, "completions/mean_terminated_length": 638.1666870117188, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.41723202170963364, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.9658385093167697e-07, "loss": 0.0, "num_tokens": 40314258.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1230 }, { "completion_length": 941.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1846.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 941.5833740234375, "completions/mean_terminated_length": 941.5833740234375, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.41757123473541385, "frac_reward_zero_std": 0.0, "grad_norm": 0.10619011521339417, "kl": 0.0, "learning_rate": 2.9641131815044857e-07, "loss": 0.0012, "num_tokens": 40337359.0, "reward": 1.1666667461395264, "reward_std": 0.09559705853462219, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1231 }, { "completion_length": 1161.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1794.0, "completions/max_terminated_length": 1794.0, "completions/mean_length": 1161.5833740234375, "completions/mean_terminated_length": 1161.5833740234375, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.417910447761194, "frac_reward_zero_std": 0.5, "grad_norm": 0.1446516066789627, "kl": 0.0, "learning_rate": 2.962387853692201e-07, "loss": 0.0021, "num_tokens": 40362368.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1232 }, { "completion_length": 1928.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4943.0, "completions/max_terminated_length": 4943.0, "completions/mean_length": 1928.3333740234375, "completions/mean_terminated_length": 1928.3333740234375, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.41824966078697423, "frac_reward_zero_std": 0.5, "grad_norm": 0.2994583249092102, "kl": 0.0, "learning_rate": 2.9606625258799173e-07, "loss": -0.0055, "num_tokens": 40393752.0, "reward": 0.36666667461395264, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.06666667014360428, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1233 }, { "completion_length": 1301.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4077.0, "completions/max_terminated_length": 4077.0, "completions/mean_length": 1301.3333740234375, "completions/mean_terminated_length": 1301.3333740234375, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 0.4185888738127544, "frac_reward_zero_std": 0.0, "grad_norm": 0.4055482745170593, "kl": 0.0, "learning_rate": 2.9589371980676323e-07, "loss": 0.0111, "num_tokens": 40421326.0, "reward": 1.1000001430511475, "reward_std": 0.3098386526107788, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.38138505816459656, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1234 }, { "completion_length": 1271.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2263.0, "completions/max_terminated_length": 2263.0, "completions/mean_length": 1271.5833740234375, "completions/mean_terminated_length": 1271.5833740234375, "completions/min_length": 594.0, "completions/min_terminated_length": 594.0, "epoch": 0.4189280868385346, "frac_reward_zero_std": 0.0, "grad_norm": 0.10634957253932953, "kl": 0.0, "learning_rate": 2.9572118702553484e-07, "loss": -0.0005, "num_tokens": 40451039.0, "reward": 1.1666667461395264, "reward_std": 0.09559705853462219, "rewards/correctness_reward_func/mean": 0.8666667342185974, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1235 }, { "completion_length": 1340.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2081.0, "completions/max_terminated_length": 2081.0, "completions/mean_length": 1340.5833740234375, "completions/mean_terminated_length": 1340.5833740234375, "completions/min_length": 624.0, "completions/min_terminated_length": 624.0, "epoch": 0.41926729986431477, "frac_reward_zero_std": 0.0, "grad_norm": 0.45237016677856445, "kl": 0.0, "learning_rate": 2.9554865424430644e-07, "loss": 0.0028, "num_tokens": 40477212.0, "reward": 1.1500000953674316, "reward_std": 0.2557639479637146, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444525599479675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1236 }, { "completion_length": 2085.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4484.0, "completions/max_terminated_length": 4484.0, "completions/mean_length": 2085.166748046875, "completions/mean_terminated_length": 2085.166748046875, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.419606512890095, "frac_reward_zero_std": 0.5, "grad_norm": 0.45721933245658875, "kl": 0.0, "learning_rate": 2.95376121463078e-07, "loss": 0.0039, "num_tokens": 40512110.0, "reward": 0.9666668176651001, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1237 }, { "completion_length": 2530.25, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4699.0, "completions/mean_length": 3079.33349609375, "completions/mean_terminated_length": 2760.272705078125, "completions/min_length": 902.0, "completions/min_terminated_length": 902.0, "epoch": 0.41994572591587515, "frac_reward_zero_std": 0.5, "grad_norm": 0.5298830270767212, "kl": NaN, "learning_rate": 2.9520358868184955e-07, "loss": -0.0302, "num_tokens": 40555475.0, "reward": 0.8083333969116211, "reward_std": 0.23327380418777466, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1238 }, { "completion_length": 999.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1720.0, "completions/max_terminated_length": 1720.0, "completions/mean_length": 999.5, "completions/mean_terminated_length": 999.5, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "epoch": 0.42028493894165536, "frac_reward_zero_std": 1.0, "grad_norm": 9.065224304549702e-08, "kl": 0.0, "learning_rate": 2.950310559006211e-07, "loss": 0.0, "num_tokens": 40578683.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1239 }, { "completion_length": 1166.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3755.0, "completions/max_terminated_length": 3755.0, "completions/mean_length": 1166.5833740234375, "completions/mean_terminated_length": 1166.5833740234375, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.4206241519674355, "frac_reward_zero_std": 0.5, "grad_norm": 0.07161006331443787, "kl": 0.0, "learning_rate": 2.948585231193927e-07, "loss": -0.0009, "num_tokens": 40604532.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1240 }, { "completion_length": 1215.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2459.0, "completions/max_terminated_length": 2459.0, "completions/mean_length": 1215.75, "completions/mean_terminated_length": 1215.75, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "epoch": 0.42096336499321574, "frac_reward_zero_std": 0.5, "grad_norm": 0.059772931039333344, "kl": 0.0, "learning_rate": 2.946859903381642e-07, "loss": -0.0002, "num_tokens": 40634379.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1241 }, { "completion_length": 1802.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4089.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 1802.416748046875, "completions/mean_terminated_length": 1802.416748046875, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "epoch": 0.42130257801899595, "frac_reward_zero_std": 0.5, "grad_norm": 0.04612265154719353, "kl": 0.0, "learning_rate": 2.945134575569358e-07, "loss": -0.0, "num_tokens": 40669550.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1242 }, { "completion_length": 2010.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3329.0, "completions/max_terminated_length": 3329.0, "completions/mean_length": 2010.5, "completions/mean_terminated_length": 2010.5, "completions/min_length": 970.0, "completions/min_terminated_length": 970.0, "epoch": 0.4216417910447761, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.9434092477570736e-07, "loss": 0.0, "num_tokens": 40708754.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1243 }, { "completion_length": 2327.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4802.0, "completions/max_terminated_length": 4802.0, "completions/mean_length": 2327.666748046875, "completions/mean_terminated_length": 2327.666748046875, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "epoch": 0.42198100407055633, "frac_reward_zero_std": 0.0, "grad_norm": 0.14423444867134094, "kl": 0.0, "learning_rate": 2.9416839199447897e-07, "loss": 0.0023, "num_tokens": 40749448.0, "reward": 1.2333333492279053, "reward_std": 0.0955970287322998, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1244 }, { "completion_length": 3550.75, "completions/clipped_ratio": 0.0, "completions/max_length": 6417.0, "completions/max_terminated_length": 6417.0, "completions/mean_length": 3550.75, "completions/mean_terminated_length": 3550.75, "completions/min_length": 1464.0, "completions/min_terminated_length": 1464.0, "epoch": 0.4223202170963365, "frac_reward_zero_std": 0.5, "grad_norm": 0.4263998568058014, "kl": 0.0, "learning_rate": 2.9399585921325047e-07, "loss": 0.0045, "num_tokens": 40803295.0, "reward": 1.133333444595337, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1245 }, { "completion_length": 1927.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4710.0, "completions/max_terminated_length": 4710.0, "completions/mean_length": 1927.916748046875, "completions/mean_terminated_length": 1927.916748046875, "completions/min_length": 960.0, "completions/min_terminated_length": 960.0, "epoch": 0.4226594301221167, "frac_reward_zero_std": 0.5, "grad_norm": 0.32188019156455994, "kl": 0.0, "learning_rate": 2.938233264320221e-07, "loss": -0.0076, "num_tokens": 40838376.0, "reward": 1.1000001430511475, "reward_std": 0.20000001788139343, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1246 }, { "completion_length": 1756.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4104.0, "completions/max_terminated_length": 4104.0, "completions/mean_length": 1756.666748046875, "completions/mean_terminated_length": 1756.666748046875, "completions/min_length": 969.0, "completions/min_terminated_length": 969.0, "epoch": 0.42299864314789687, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.9365079365079363e-07, "loss": 0.0, "num_tokens": 40875056.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1247 }, { "completion_length": 2128.0833740234375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6384.0, "completions/mean_length": 3226.25, "completions/mean_terminated_length": 2553.699951171875, "completions/min_length": 561.0, "completions/min_terminated_length": 561.0, "epoch": 0.4233378561736771, "frac_reward_zero_std": 0.5, "grad_norm": 1.1419695615768433, "kl": NaN, "learning_rate": 2.9347826086956523e-07, "loss": -0.1025, "num_tokens": 40912629.0, "reward": 0.9500001668930054, "reward_std": 0.3130495250225067, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.3357488214969635, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 1248 }, { "completion_length": 939.9167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 1204.0, "completions/max_terminated_length": 1204.0, "completions/mean_length": 939.9166870117188, "completions/mean_terminated_length": 939.9166870117188, "completions/min_length": 565.0, "completions/min_terminated_length": 565.0, "epoch": 0.42367706919945725, "frac_reward_zero_std": 0.5, "grad_norm": 0.08930764347314835, "kl": 0.0, "learning_rate": 2.9330572808833673e-07, "loss": 0.0017, "num_tokens": 40934624.0, "reward": 0.7333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1249 }, { "completion_length": 1535.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3041.0, "completions/max_terminated_length": 3041.0, "completions/mean_length": 1535.0833740234375, "completions/mean_terminated_length": 1535.0833740234375, "completions/min_length": 926.0, "completions/min_terminated_length": 926.0, "epoch": 0.42401628222523746, "frac_reward_zero_std": 0.5, "grad_norm": 0.08592921495437622, "kl": 0.0, "learning_rate": 2.9313319530710834e-07, "loss": 0.0002, "num_tokens": 40970037.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.09045340120792389, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1250 }, { "completion_length": 919.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2276.0, "completions/max_terminated_length": 2276.0, "completions/mean_length": 919.4166870117188, "completions/mean_terminated_length": 919.4166870117188, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.4243554952510176, "frac_reward_zero_std": 0.5, "grad_norm": 0.08206996321678162, "kl": 0.0, "learning_rate": 2.9296066252587995e-07, "loss": -0.0013, "num_tokens": 40992920.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1251 }, { "completion_length": 612.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1082.0, "completions/max_terminated_length": 1082.0, "completions/mean_length": 612.75, "completions/mean_terminated_length": 612.75, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.42469470827679784, "frac_reward_zero_std": 0.5, "grad_norm": 0.27978086471557617, "kl": 0.0, "learning_rate": 2.9278812974465145e-07, "loss": -0.0, "num_tokens": 41011949.0, "reward": 0.8833333849906921, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.5149286389350891, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1252 }, { "completion_length": 1159.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3352.0, "completions/max_terminated_length": 3352.0, "completions/mean_length": 1159.166748046875, "completions/mean_terminated_length": 1159.166748046875, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.425033921302578, "frac_reward_zero_std": 1.0, "grad_norm": 2.065001609707906e-07, "kl": 0.0, "learning_rate": 2.9261559696342305e-07, "loss": 0.0, "num_tokens": 41038741.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1253 }, { "completion_length": 2059.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4291.0, "completions/max_terminated_length": 4291.0, "completions/mean_length": 2059.08349609375, "completions/mean_terminated_length": 2059.08349609375, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 0.4253731343283582, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.924430641821946e-07, "loss": 0.0, "num_tokens": 41072000.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1254 }, { "completion_length": 3018.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6117.0, "completions/max_terminated_length": 6117.0, "completions/mean_length": 3018.5, "completions/mean_terminated_length": 3018.5, "completions/min_length": 886.0, "completions/min_terminated_length": 886.0, "epoch": 0.4257123473541384, "frac_reward_zero_std": 1.0, "grad_norm": 1.6355672016743483e-07, "kl": 0.0, "learning_rate": 2.922705314009662e-07, "loss": 0.0, "num_tokens": 41116664.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1255 }, { "completion_length": 1531.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5968.0, "completions/max_terminated_length": 5968.0, "completions/mean_length": 1531.75, "completions/mean_terminated_length": 1531.75, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.4260515603799186, "frac_reward_zero_std": 1.0, "grad_norm": 2.850880775895348e-07, "kl": 0.0, "learning_rate": 2.920979986197377e-07, "loss": 0.0, "num_tokens": 41145203.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1256 }, { "completion_length": 1400.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2668.0, "completions/max_terminated_length": 2668.0, "completions/mean_length": 1400.75, "completions/mean_terminated_length": 1400.75, "completions/min_length": 592.0, "completions/min_terminated_length": 592.0, "epoch": 0.42639077340569875, "frac_reward_zero_std": 1.0, "grad_norm": 1.9205478452022362e-07, "kl": 0.0, "learning_rate": 2.919254658385093e-07, "loss": 0.0, "num_tokens": 41175488.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1257 }, { "completion_length": 1414.2500610351562, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 2810.0, "completions/mean_length": 1963.3333740234375, "completions/mean_terminated_length": 1542.8182373046875, "completions/min_length": 780.0, "completions/min_terminated_length": 780.0, "epoch": 0.42672998643147897, "frac_reward_zero_std": 0.0, "grad_norm": 0.49453458189964294, "kl": NaN, "learning_rate": 2.9175293305728087e-07, "loss": -0.0228, "num_tokens": 41200997.0, "reward": 0.8083333969116211, "reward_std": 0.44362562894821167, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1258 }, { "completion_length": 948.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1718.0, "completions/max_terminated_length": 1718.0, "completions/mean_length": 948.0833740234375, "completions/mean_terminated_length": 948.0833740234375, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.4270691994572592, "frac_reward_zero_std": 1.0, "grad_norm": 8.066543699669637e-08, "kl": 0.0, "learning_rate": 2.915804002760525e-07, "loss": 0.0, "num_tokens": 41224716.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1259 }, { "completion_length": 2468.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5283.0, "completions/max_terminated_length": 5283.0, "completions/mean_length": 2468.166748046875, "completions/mean_terminated_length": 2468.166748046875, "completions/min_length": 771.0, "completions/min_terminated_length": 771.0, "epoch": 0.42740841248303935, "frac_reward_zero_std": 0.0, "grad_norm": 0.13931134343147278, "kl": 0.0, "learning_rate": 2.91407867494824e-07, "loss": -0.0028, "num_tokens": 41264810.0, "reward": 1.2000000476837158, "reward_std": 0.10327951610088348, "rewards/correctness_reward_func/mean": 0.9000000357627869, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1260 }, { "completion_length": 1625.5000915527344, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 4621.0, "completions/mean_length": 3272.75, "completions/mean_terminated_length": 2167.333251953125, "completions/min_length": 1059.0, "completions/min_terminated_length": 1059.0, "epoch": 0.42774762550881956, "frac_reward_zero_std": 0.5, "grad_norm": 0.6503078937530518, "kl": NaN, "learning_rate": 2.912353347135956e-07, "loss": -0.0781, "num_tokens": 41299460.0, "reward": 0.8916666507720947, "reward_std": 0.32158464193344116, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.4923659861087799, "rewards/format_reward_func/mean": 0.2250000238418579, "rewards/format_reward_func/std": 0.13568010926246643, "step": 1261 }, { "completion_length": 1470.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2410.0, "completions/max_terminated_length": 2410.0, "completions/mean_length": 1470.5833740234375, "completions/mean_terminated_length": 1470.5833740234375, "completions/min_length": 664.0, "completions/min_terminated_length": 664.0, "epoch": 0.4280868385345997, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.9106280193236713e-07, "loss": 0.0, "num_tokens": 41330661.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1262 }, { "completion_length": 1977.666748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5490.0, "completions/mean_length": 2526.75, "completions/mean_terminated_length": 2157.45458984375, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.42842605156037994, "frac_reward_zero_std": 0.5, "grad_norm": 0.15041987597942352, "kl": NaN, "learning_rate": 2.908902691511387e-07, "loss": -0.0118, "num_tokens": 41365457.0, "reward": 0.27500003576278687, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1263 }, { "completion_length": 2472.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4144.0, "completions/max_terminated_length": 4144.0, "completions/mean_length": 2472.33349609375, "completions/mean_terminated_length": 2472.33349609375, "completions/min_length": 795.0, "completions/min_terminated_length": 795.0, "epoch": 0.4287652645861601, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.9071773636991024e-07, "loss": 0.0, "num_tokens": 41406195.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1264 }, { "completion_length": 1051.1667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2821.0, "completions/max_terminated_length": 2821.0, "completions/mean_length": 1051.166748046875, "completions/mean_terminated_length": 1051.166748046875, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.4291044776119403, "frac_reward_zero_std": 1.0, "grad_norm": 1.2125136095164635e-07, "kl": 0.0, "learning_rate": 2.9054520358868184e-07, "loss": 0.0, "num_tokens": 41431799.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1265 }, { "completion_length": 1767.0000610351562, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6455.0, "completions/mean_length": 2865.166748046875, "completions/mean_terminated_length": 2120.400146484375, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.4294436906377205, "frac_reward_zero_std": 0.5, "grad_norm": 0.9176818132400513, "kl": NaN, "learning_rate": 2.9037267080745345e-07, "loss": -0.0941, "num_tokens": 41470763.0, "reward": 1.0833332538604736, "reward_std": 0.3356585204601288, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 1266 }, { "completion_length": 1748.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4314.0, "completions/max_terminated_length": 4314.0, "completions/mean_length": 1748.916748046875, "completions/mean_terminated_length": 1748.916748046875, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.4297829036635007, "frac_reward_zero_std": 1.0, "grad_norm": 1.2544077776510676e-07, "kl": 0.0, "learning_rate": 2.9020013802622495e-07, "loss": 0.0, "num_tokens": 41506258.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1267 }, { "completion_length": 1399.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 4152.0, "completions/max_terminated_length": 4152.0, "completions/mean_length": 1399.75, "completions/mean_terminated_length": 1399.75, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 0.43012211668928085, "frac_reward_zero_std": 1.0, "grad_norm": 2.2171617786170827e-07, "kl": 0.0, "learning_rate": 2.9002760524499656e-07, "loss": 0.0, "num_tokens": 41534959.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1268 }, { "completion_length": 1357.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2226.0, "completions/max_terminated_length": 2226.0, "completions/mean_length": 1357.25, "completions/mean_terminated_length": 1357.25, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.43046132971506107, "frac_reward_zero_std": 1.0, "grad_norm": 1.2577589814100065e-07, "kl": 0.0, "learning_rate": 2.898550724637681e-07, "loss": 0.0, "num_tokens": 41566528.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1269 }, { "completion_length": 617.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 833.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 617.25, "completions/mean_terminated_length": 617.25, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 0.43080054274084123, "frac_reward_zero_std": 0.5, "grad_norm": 0.06090317294001579, "kl": 0.0, "learning_rate": 2.896825396825397e-07, "loss": 0.0003, "num_tokens": 41587657.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1270 }, { "completion_length": 2021.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4024.0, "completions/max_terminated_length": 4024.0, "completions/mean_length": 2021.5, "completions/mean_terminated_length": 2021.5, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.43113975576662145, "frac_reward_zero_std": 1.0, "grad_norm": 3.5343211379768036e-07, "kl": 0.0, "learning_rate": 2.895100069013112e-07, "loss": 0.0, "num_tokens": 41626969.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1271 }, { "completion_length": 1478.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3186.0, "completions/max_terminated_length": 3186.0, "completions/mean_length": 1478.0, "completions/mean_terminated_length": 1478.0, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.4314789687924016, "frac_reward_zero_std": 0.5, "grad_norm": 0.40958499908447266, "kl": 0.0, "learning_rate": 2.893374741200828e-07, "loss": 0.0124, "num_tokens": 41653789.0, "reward": 0.7833334803581238, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4833333194255829, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1272 }, { "completion_length": 1154.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1755.0, "completions/max_terminated_length": 1755.0, "completions/mean_length": 1154.666748046875, "completions/mean_terminated_length": 1154.666748046875, "completions/min_length": 626.0, "completions/min_terminated_length": 626.0, "epoch": 0.4318181818181818, "frac_reward_zero_std": 0.0, "grad_norm": 0.1274430900812149, "kl": 0.0, "learning_rate": 2.8916494133885437e-07, "loss": -0.0013, "num_tokens": 41682981.0, "reward": 1.183333396911621, "reward_std": 0.10641199350357056, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1273 }, { "completion_length": 1220.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1715.0, "completions/max_terminated_length": 1715.0, "completions/mean_length": 1220.666748046875, "completions/mean_terminated_length": 1220.666748046875, "completions/min_length": 731.0, "completions/min_terminated_length": 731.0, "epoch": 0.432157394843962, "frac_reward_zero_std": 1.0, "grad_norm": 1.8246319655190746e-07, "kl": 0.0, "learning_rate": 2.889924085576259e-07, "loss": 0.0, "num_tokens": 41711219.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1274 }, { "completion_length": 1417.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4358.0, "completions/max_terminated_length": 4358.0, "completions/mean_length": 1417.666748046875, "completions/mean_terminated_length": 1417.666748046875, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 0.4324966078697422, "frac_reward_zero_std": 1.0, "grad_norm": 2.2063801452532061e-07, "kl": 0.0, "learning_rate": 2.888198757763975e-07, "loss": 0.0, "num_tokens": 41740093.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1275 }, { "completion_length": 2354.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5608.0, "completions/max_terminated_length": 5608.0, "completions/mean_length": 2354.5, "completions/mean_terminated_length": 2354.5, "completions/min_length": 676.0, "completions/min_terminated_length": 676.0, "epoch": 0.43283582089552236, "frac_reward_zero_std": 0.0, "grad_norm": 0.14763855934143066, "kl": 0.0, "learning_rate": 2.886473429951691e-07, "loss": -0.0056, "num_tokens": 41777587.0, "reward": 1.183333396911621, "reward_std": 0.10641201585531235, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1276 }, { "completion_length": 929.9167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 1663.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 929.9166870117188, "completions/mean_terminated_length": 929.9166870117188, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.4331750339213026, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.8847481021394064e-07, "loss": 0.0, "num_tokens": 41804418.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1277 }, { "completion_length": 1276.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3759.0, "completions/max_terminated_length": 3759.0, "completions/mean_length": 1276.666748046875, "completions/mean_terminated_length": 1276.666748046875, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.4335142469470828, "frac_reward_zero_std": 0.5, "grad_norm": 0.07686138898134232, "kl": 0.0, "learning_rate": 2.883022774327122e-07, "loss": -0.0011, "num_tokens": 41831168.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1278 }, { "completion_length": 2874.666748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6367.0, "completions/mean_length": 3423.75, "completions/mean_terminated_length": 3136.0, "completions/min_length": 1107.0, "completions/min_terminated_length": 1107.0, "epoch": 0.43385345997286295, "frac_reward_zero_std": 0.5, "grad_norm": 0.4911734163761139, "kl": NaN, "learning_rate": 2.8812974465148374e-07, "loss": -0.0401, "num_tokens": 41878042.0, "reward": 0.4750000238418579, "reward_std": 0.2524876296520233, "rewards/correctness_reward_func/mean": 0.20000000298023224, "rewards/correctness_reward_func/std": 0.36181360483169556, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 1279 }, { "completion_length": 792.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1339.0, "completions/max_terminated_length": 1339.0, "completions/mean_length": 792.75, "completions/mean_terminated_length": 792.75, "completions/min_length": 522.0, "completions/min_terminated_length": 522.0, "epoch": 0.43419267299864317, "frac_reward_zero_std": 0.5, "grad_norm": 0.06797429174184799, "kl": 0.0, "learning_rate": 2.8795721187025535e-07, "loss": -0.0003, "num_tokens": 41897929.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1280 }, { "completion_length": 1000.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1588.0, "completions/max_terminated_length": 1588.0, "completions/mean_length": 1000.4166870117188, "completions/mean_terminated_length": 1000.4166870117188, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.43453188602442333, "frac_reward_zero_std": 0.5, "grad_norm": 0.061253227293491364, "kl": 0.0, "learning_rate": 2.877846790890269e-07, "loss": -0.0006, "num_tokens": 41921226.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1281 }, { "completion_length": 793.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2097.0, "completions/max_terminated_length": 2097.0, "completions/mean_length": 793.9166870117188, "completions/mean_terminated_length": 793.9166870117188, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.43487109905020355, "frac_reward_zero_std": 1.0, "grad_norm": 1.7298485488481674e-07, "kl": 0.0, "learning_rate": 2.8761214630779845e-07, "loss": 0.0, "num_tokens": 41943161.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1282 }, { "completion_length": 662.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1317.0, "completions/max_terminated_length": 1317.0, "completions/mean_length": 662.0833740234375, "completions/mean_terminated_length": 662.0833740234375, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.4352103120759837, "frac_reward_zero_std": 0.0, "grad_norm": 0.08112464100122452, "kl": 0.0, "learning_rate": 2.8743961352657006e-07, "loss": -0.0006, "num_tokens": 41961558.0, "reward": 1.2000000476837158, "reward_std": 0.08164963126182556, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1283 }, { "completion_length": 1117.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1674.0, "completions/max_terminated_length": 1674.0, "completions/mean_length": 1117.75, "completions/mean_terminated_length": 1117.75, "completions/min_length": 526.0, "completions/min_terminated_length": 526.0, "epoch": 0.4355495251017639, "frac_reward_zero_std": 1.0, "grad_norm": 1.7792035578167997e-07, "kl": 0.0, "learning_rate": 2.872670807453416e-07, "loss": 0.0, "num_tokens": 41989683.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1284 }, { "completion_length": 2060.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4427.0, "completions/max_terminated_length": 4427.0, "completions/mean_length": 2060.08349609375, "completions/mean_terminated_length": 2060.08349609375, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "epoch": 0.4358887381275441, "frac_reward_zero_std": 0.5, "grad_norm": 0.0860561728477478, "kl": 0.0, "learning_rate": 2.8709454796411317e-07, "loss": 0.0003, "num_tokens": 42025510.0, "reward": 0.7666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1285 }, { "completion_length": 1391.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2755.0, "completions/max_terminated_length": 2755.0, "completions/mean_length": 1391.8333740234375, "completions/mean_terminated_length": 1391.8333740234375, "completions/min_length": 827.0, "completions/min_terminated_length": 827.0, "epoch": 0.4362279511533243, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.869220151828847e-07, "loss": 0.0, "num_tokens": 42053708.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1286 }, { "completion_length": 2253.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 6412.0, "completions/max_terminated_length": 6412.0, "completions/mean_length": 2253.916748046875, "completions/mean_terminated_length": 2253.916748046875, "completions/min_length": 606.0, "completions/min_terminated_length": 606.0, "epoch": 0.43656716417910446, "frac_reward_zero_std": 0.5, "grad_norm": 0.6813945174217224, "kl": 0.0, "learning_rate": 2.867494824016563e-07, "loss": -0.0232, "num_tokens": 42092659.0, "reward": 0.8833333849906921, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.5149286389350891, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1287 }, { "completion_length": 891.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1536.0, "completions/max_terminated_length": 1536.0, "completions/mean_length": 891.5, "completions/mean_terminated_length": 891.5, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.4369063772048847, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.865769496204279e-07, "loss": 0.0, "num_tokens": 42117085.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1288 }, { "completion_length": 1306.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2983.0, "completions/max_terminated_length": 2983.0, "completions/mean_length": 1306.5, "completions/mean_terminated_length": 1306.5, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.43724559023066484, "frac_reward_zero_std": 1.0, "grad_norm": 1.629204717801258e-07, "kl": 0.0, "learning_rate": 2.8640441683919943e-07, "loss": 0.0, "num_tokens": 42145831.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1289 }, { "completion_length": 1113.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1635.0, "completions/max_terminated_length": 1635.0, "completions/mean_length": 1113.5, "completions/mean_terminated_length": 1113.5, "completions/min_length": 685.0, "completions/min_terminated_length": 685.0, "epoch": 0.43758480325644505, "frac_reward_zero_std": 0.5, "grad_norm": 0.08702625334262848, "kl": 0.0, "learning_rate": 2.86231884057971e-07, "loss": -0.001, "num_tokens": 42173515.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1290 }, { "completion_length": 1537.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2641.0, "completions/max_terminated_length": 2641.0, "completions/mean_length": 1537.0833740234375, "completions/mean_terminated_length": 1537.0833740234375, "completions/min_length": 753.0, "completions/min_terminated_length": 753.0, "epoch": 0.4379240162822252, "frac_reward_zero_std": 0.0, "grad_norm": 0.21817618608474731, "kl": 0.0, "learning_rate": 2.860593512767426e-07, "loss": -0.0025, "num_tokens": 42202100.0, "reward": 1.2166666984558105, "reward_std": 0.09246458113193512, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1291 }, { "completion_length": 2236.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4469.0, "completions/max_terminated_length": 4469.0, "completions/mean_length": 2236.25, "completions/mean_terminated_length": 2236.25, "completions/min_length": 957.0, "completions/min_terminated_length": 957.0, "epoch": 0.43826322930800543, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.8588681849551414e-07, "loss": 0.0, "num_tokens": 42238349.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1292 }, { "completion_length": 1417.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2724.0, "completions/max_terminated_length": 2724.0, "completions/mean_length": 1417.416748046875, "completions/mean_terminated_length": 1417.416748046875, "completions/min_length": 723.0, "completions/min_terminated_length": 723.0, "epoch": 0.4386024423337856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.857142857142857e-07, "loss": 0.0, "num_tokens": 42266458.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1293 }, { "completion_length": 2986.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5996.0, "completions/max_terminated_length": 5996.0, "completions/mean_length": 2986.916748046875, "completions/mean_terminated_length": 2986.916748046875, "completions/min_length": 957.0, "completions/min_terminated_length": 957.0, "epoch": 0.4389416553595658, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.8554175293305725e-07, "loss": 0.0, "num_tokens": 42314133.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1294 }, { "completion_length": 1779.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3870.0, "completions/max_terminated_length": 3870.0, "completions/mean_length": 1779.166748046875, "completions/mean_terminated_length": 1779.166748046875, "completions/min_length": 665.0, "completions/min_terminated_length": 665.0, "epoch": 0.439280868385346, "frac_reward_zero_std": 0.5, "grad_norm": 0.10210628062486649, "kl": 0.0, "learning_rate": 2.8536922015182885e-07, "loss": 0.0043, "num_tokens": 42346283.0, "reward": 1.2333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1295 }, { "completion_length": 2721.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 6307.0, "completions/max_terminated_length": 6307.0, "completions/mean_length": 2721.08349609375, "completions/mean_terminated_length": 2721.08349609375, "completions/min_length": 520.0, "completions/min_terminated_length": 520.0, "epoch": 0.4396200814111262, "frac_reward_zero_std": 0.0, "grad_norm": 0.1718180775642395, "kl": 0.0, "learning_rate": 2.8519668737060035e-07, "loss": 0.0085, "num_tokens": 42394014.0, "reward": 1.25, "reward_std": 0.09246455878019333, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1296 }, { "completion_length": 2141.3334350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 4843.0, "completions/max_terminated_length": 4843.0, "completions/mean_length": 2141.33349609375, "completions/mean_terminated_length": 2141.33349609375, "completions/min_length": 553.0, "completions/min_terminated_length": 553.0, "epoch": 0.4399592944369064, "frac_reward_zero_std": 1.0, "grad_norm": 2.1731621302478743e-07, "kl": 0.0, "learning_rate": 2.8502415458937196e-07, "loss": 0.0, "num_tokens": 42429538.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1297 }, { "completion_length": 1390.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3996.0, "completions/max_terminated_length": 3996.0, "completions/mean_length": 1390.916748046875, "completions/mean_terminated_length": 1390.916748046875, "completions/min_length": 628.0, "completions/min_terminated_length": 628.0, "epoch": 0.44029850746268656, "frac_reward_zero_std": 0.5, "grad_norm": 0.07795817404985428, "kl": 0.0, "learning_rate": 2.8485162180814356e-07, "loss": -0.0004, "num_tokens": 42460323.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1298 }, { "completion_length": 3076.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6103.0, "completions/max_terminated_length": 6103.0, "completions/mean_length": 3076.916748046875, "completions/mean_terminated_length": 3076.916748046875, "completions/min_length": 1045.0, "completions/min_terminated_length": 1045.0, "epoch": 0.4406377204884668, "frac_reward_zero_std": 0.5, "grad_norm": 0.07325087487697601, "kl": 0.0, "learning_rate": 2.846790890269151e-07, "loss": 0.0049, "num_tokens": 42508370.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1299 }, { "completion_length": 2096.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3320.0, "completions/max_terminated_length": 3320.0, "completions/mean_length": 2096.416748046875, "completions/mean_terminated_length": 2096.416748046875, "completions/min_length": 1034.0, "completions/min_terminated_length": 1034.0, "epoch": 0.44097693351424694, "frac_reward_zero_std": 1.0, "grad_norm": 1.182023154910894e-07, "kl": 0.0, "learning_rate": 2.8450655624568667e-07, "loss": 0.0, "num_tokens": 42546961.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1300 }, { "completion_length": 1965.25, "completions/clipped_ratio": 0.0, "completions/max_length": 6316.0, "completions/max_terminated_length": 6316.0, "completions/mean_length": 1965.25, "completions/mean_terminated_length": 1965.25, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.44131614654002715, "frac_reward_zero_std": 0.0, "grad_norm": 0.9132653474807739, "kl": 0.0, "learning_rate": 2.843340234644582e-07, "loss": -0.0353, "num_tokens": 42583942.0, "reward": 0.8666666746139526, "reward_std": 0.24494892358779907, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1301 }, { "completion_length": 1037.6667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 1947.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 1037.666748046875, "completions/mean_terminated_length": 1037.666748046875, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.4416553595658073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.8416149068322983e-07, "loss": 0.0, "num_tokens": 42603822.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1302 }, { "completion_length": 1120.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1579.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 1120.0, "completions/mean_terminated_length": 1120.0, "completions/min_length": 852.0, "completions/min_terminated_length": 852.0, "epoch": 0.44199457259158753, "frac_reward_zero_std": 0.5, "grad_norm": 0.06896739453077316, "kl": 0.0, "learning_rate": 2.839889579020014e-07, "loss": -0.0002, "num_tokens": 42631404.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1303 }, { "completion_length": 968.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1616.0, "completions/max_terminated_length": 1616.0, "completions/mean_length": 968.3333740234375, "completions/mean_terminated_length": 968.3333740234375, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 0.4423337856173677, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.8381642512077293e-07, "loss": 0.0, "num_tokens": 42655216.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1304 }, { "completion_length": 2107.0001220703125, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5752.0, "completions/mean_length": 3205.166748046875, "completions/mean_terminated_length": 2528.400146484375, "completions/min_length": 1010.0, "completions/min_terminated_length": 1010.0, "epoch": 0.4426729986431479, "frac_reward_zero_std": 0.5, "grad_norm": 0.6942918300628662, "kl": NaN, "learning_rate": 2.836438923395445e-07, "loss": -0.0526, "num_tokens": 42692404.0, "reward": 0.8000000715255737, "reward_std": 0.28106939792633057, "rewards/correctness_reward_func/mean": 0.550000011920929, "rewards/correctness_reward_func/std": 0.4100997745990753, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 1305 }, { "completion_length": 1075.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 2543.0, "completions/mean_length": 1624.0833740234375, "completions/mean_terminated_length": 1172.727294921875, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 0.44301221166892807, "frac_reward_zero_std": 0.5, "grad_norm": 0.20525957643985748, "kl": NaN, "learning_rate": 2.834713595583161e-07, "loss": -0.0204, "num_tokens": 42720010.0, "reward": 0.658333420753479, "reward_std": 0.25380438566207886, "rewards/correctness_reward_func/mean": 0.38333332538604736, "rewards/correctness_reward_func/std": 0.4783177673816681, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1306 }, { "completion_length": 1147.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2110.0, "completions/max_terminated_length": 2110.0, "completions/mean_length": 1147.666748046875, "completions/mean_terminated_length": 1147.666748046875, "completions/min_length": 565.0, "completions/min_terminated_length": 565.0, "epoch": 0.4433514246947083, "frac_reward_zero_std": 0.0, "grad_norm": 0.5518262982368469, "kl": 0.0, "learning_rate": 2.832988267770876e-07, "loss": -0.005, "num_tokens": 42746754.0, "reward": 0.5833333730697632, "reward_std": 0.4232131838798523, "rewards/correctness_reward_func/mean": 0.28333333134651184, "rewards/correctness_reward_func/std": 0.4217568039894104, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1307 }, { "completion_length": 1436.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 5349.0, "completions/max_terminated_length": 5349.0, "completions/mean_length": 1436.166748046875, "completions/mean_terminated_length": 1436.166748046875, "completions/min_length": 526.0, "completions/min_terminated_length": 526.0, "epoch": 0.44369063772048845, "frac_reward_zero_std": 0.5, "grad_norm": 0.09310908615589142, "kl": 0.0, "learning_rate": 2.831262939958592e-07, "loss": -0.0058, "num_tokens": 42776432.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1308 }, { "completion_length": 1848.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4914.0, "completions/max_terminated_length": 4914.0, "completions/mean_length": 1848.25, "completions/mean_terminated_length": 1848.25, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "epoch": 0.44402985074626866, "frac_reward_zero_std": 0.0, "grad_norm": 0.6156511902809143, "kl": 0.0, "learning_rate": 2.8295376121463075e-07, "loss": -0.0039, "num_tokens": 42816005.0, "reward": 0.9166666865348816, "reward_std": 0.4232131838798523, "rewards/correctness_reward_func/mean": 0.6166666150093079, "rewards/correctness_reward_func/std": 0.4628632962703705, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1309 }, { "completion_length": 777.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1345.0, "completions/max_terminated_length": 1345.0, "completions/mean_length": 777.8333740234375, "completions/mean_terminated_length": 777.8333740234375, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 0.4443690637720488, "frac_reward_zero_std": 0.0, "grad_norm": 0.09817202389240265, "kl": 0.0, "learning_rate": 2.8278122843340236e-07, "loss": -0.0018, "num_tokens": 42841497.0, "reward": 1.1666667461395264, "reward_std": 0.10327952355146408, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1310 }, { "completion_length": 1379.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2547.0, "completions/max_terminated_length": 2547.0, "completions/mean_length": 1379.5, "completions/mean_terminated_length": 1379.5, "completions/min_length": 632.0, "completions/min_terminated_length": 632.0, "epoch": 0.44470827679782904, "frac_reward_zero_std": 0.5, "grad_norm": 0.06035196781158447, "kl": 0.0, "learning_rate": 2.8260869565217386e-07, "loss": -0.0017, "num_tokens": 42870141.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1311 }, { "completion_length": 1255.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2848.0, "completions/max_terminated_length": 2848.0, "completions/mean_length": 1255.0833740234375, "completions/mean_terminated_length": 1255.0833740234375, "completions/min_length": 539.0, "completions/min_terminated_length": 539.0, "epoch": 0.4450474898236092, "frac_reward_zero_std": 0.5, "grad_norm": 0.08878006041049957, "kl": 0.0, "learning_rate": 2.8243616287094546e-07, "loss": -0.0041, "num_tokens": 42897556.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1312 }, { "completion_length": 791.0000305175781, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 3433.0, "completions/mean_length": 3536.416748046875, "completions/mean_terminated_length": 1356.0, "completions/min_length": 650.0, "completions/min_terminated_length": 650.0, "epoch": 0.4453867028493894, "frac_reward_zero_std": 0.5, "grad_norm": 0.1398114562034607, "kl": NaN, "learning_rate": 2.8226363008971707e-07, "loss": -0.0109, "num_tokens": 42918700.0, "reward": 0.5750000476837158, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.17499999701976776, "rewards/format_reward_func/std": 0.15447859466075897, "step": 1313 }, { "completion_length": 1822.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3173.0, "completions/max_terminated_length": 3173.0, "completions/mean_length": 1822.416748046875, "completions/mean_terminated_length": 1822.416748046875, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.44572591587516963, "frac_reward_zero_std": 0.5, "grad_norm": 0.6276273727416992, "kl": 0.0, "learning_rate": 2.820910973084886e-07, "loss": -0.0169, "num_tokens": 42953679.0, "reward": 0.46666666865348816, "reward_std": 0.2581988573074341, "rewards/correctness_reward_func/mean": 0.1666666716337204, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1314 }, { "completion_length": 929.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3021.0, "completions/max_terminated_length": 3021.0, "completions/mean_length": 929.3333740234375, "completions/mean_terminated_length": 929.3333740234375, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.4460651289009498, "frac_reward_zero_std": 0.5, "grad_norm": 0.05867432430386543, "kl": 0.0, "learning_rate": 2.819185645272602e-07, "loss": 0.0016, "num_tokens": 42975499.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1315 }, { "completion_length": 1247.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1927.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 1247.0, "completions/mean_terminated_length": 1247.0, "completions/min_length": 676.0, "completions/min_terminated_length": 676.0, "epoch": 0.44640434192673, "frac_reward_zero_std": 0.0, "grad_norm": 0.37302249670028687, "kl": 0.0, "learning_rate": 2.8174603174603173e-07, "loss": 0.0022, "num_tokens": 43003879.0, "reward": 0.9166666865348816, "reward_std": 0.24738392233848572, "rewards/correctness_reward_func/mean": 0.6166666150093079, "rewards/correctness_reward_func/std": 0.4628632962703705, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1316 }, { "completion_length": 1275.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2561.0, "completions/max_terminated_length": 2561.0, "completions/mean_length": 1275.25, "completions/mean_terminated_length": 1275.25, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.44674355495251017, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.8157349896480333e-07, "loss": 0.0, "num_tokens": 43031290.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1317 }, { "completion_length": 1122.5000610351562, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3626.0, "completions/mean_length": 3318.83349609375, "completions/mean_terminated_length": 1683.75, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 0.4470827679782904, "frac_reward_zero_std": 0.5, "grad_norm": 0.16851428151130676, "kl": NaN, "learning_rate": 2.8140096618357483e-07, "loss": -0.0157, "num_tokens": 43058764.0, "reward": 0.6000000834465027, "reward_std": 0.0774596780538559, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 1318 }, { "completion_length": 1469.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4416.0, "completions/max_terminated_length": 4416.0, "completions/mean_length": 1469.5, "completions/mean_terminated_length": 1469.5, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 0.44742198100407055, "frac_reward_zero_std": 0.5, "grad_norm": 0.7298623323440552, "kl": 0.0, "learning_rate": 2.8122843340234644e-07, "loss": 0.0407, "num_tokens": 43088062.0, "reward": 0.6333333253860474, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1319 }, { "completion_length": 2309.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6214.0, "completions/max_terminated_length": 6214.0, "completions/mean_length": 2309.75, "completions/mean_terminated_length": 2309.75, "completions/min_length": 575.0, "completions/min_terminated_length": 575.0, "epoch": 0.44776119402985076, "frac_reward_zero_std": 0.5, "grad_norm": 0.547522783279419, "kl": 0.0, "learning_rate": 2.81055900621118e-07, "loss": -0.0402, "num_tokens": 43128217.0, "reward": 1.2000000476837158, "reward_std": 0.20000001788139343, "rewards/correctness_reward_func/mean": 0.9000000357627869, "rewards/correctness_reward_func/std": 0.28919950127601624, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1320 }, { "completion_length": 1472.8333740234375, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 4186.0, "completions/mean_length": 3120.08349609375, "completions/mean_terminated_length": 1963.77783203125, "completions/min_length": 669.0, "completions/min_terminated_length": 669.0, "epoch": 0.4481004070556309, "frac_reward_zero_std": 0.0, "grad_norm": 0.8402412533760071, "kl": NaN, "learning_rate": 2.808833678398896e-07, "loss": -0.0615, "num_tokens": 43159001.0, "reward": 0.3458333909511566, "reward_std": 0.299697607755661, "rewards/correctness_reward_func/mean": 0.13333334028720856, "rewards/correctness_reward_func/std": 0.3113996088504791, "rewards/format_reward_func/mean": 0.21250002086162567, "rewards/format_reward_func/std": 0.11894422769546509, "step": 1321 }, { "completion_length": 701.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 701.9166870117188, "completions/mean_terminated_length": 701.9166870117188, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.44843962008141114, "frac_reward_zero_std": 0.5, "grad_norm": 0.04753837361931801, "kl": 0.0, "learning_rate": 2.807108350586611e-07, "loss": -0.0003, "num_tokens": 43181716.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1322 }, { "completion_length": 1093.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 1093.75, "completions/mean_terminated_length": 1093.75, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "epoch": 0.4487788331071913, "frac_reward_zero_std": 0.5, "grad_norm": 0.089720219373703, "kl": 0.0, "learning_rate": 2.805383022774327e-07, "loss": -0.0011, "num_tokens": 43200493.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1323 }, { "completion_length": 1610.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 6400.0, "completions/max_terminated_length": 6400.0, "completions/mean_length": 1610.416748046875, "completions/mean_terminated_length": 1610.416748046875, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 0.4491180461329715, "frac_reward_zero_std": 0.0, "grad_norm": 0.14426280558109283, "kl": 0.0, "learning_rate": 2.8036576949620426e-07, "loss": 0.0092, "num_tokens": 43229370.0, "reward": 1.2333333492279053, "reward_std": 0.0955970287322998, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1324 }, { "completion_length": 2023.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5275.0, "completions/max_terminated_length": 5275.0, "completions/mean_length": 2023.0, "completions/mean_terminated_length": 2023.0, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.4494572591587517, "frac_reward_zero_std": 0.0, "grad_norm": 0.5809135437011719, "kl": 0.0, "learning_rate": 2.8019323671497586e-07, "loss": 0.0073, "num_tokens": 43263204.0, "reward": 1.0, "reward_std": 0.26368504762649536, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.3357488214969635, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1325 }, { "completion_length": 791.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1488.0, "completions/max_terminated_length": 1488.0, "completions/mean_length": 791.9166870117188, "completions/mean_terminated_length": 791.9166870117188, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 0.4497964721845319, "frac_reward_zero_std": 1.0, "grad_norm": 1.2329631715601863e-07, "kl": 0.0, "learning_rate": 2.8002070393374736e-07, "loss": 0.0, "num_tokens": 43288817.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1326 }, { "completion_length": 1349.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2642.0, "completions/max_terminated_length": 2642.0, "completions/mean_length": 1349.166748046875, "completions/mean_terminated_length": 1349.166748046875, "completions/min_length": 665.0, "completions/min_terminated_length": 665.0, "epoch": 0.45013568521031205, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.7984817115251897e-07, "loss": 0.0, "num_tokens": 43318699.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1327 }, { "completion_length": 2010.25, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5986.0, "completions/mean_length": 4206.58349609375, "completions/mean_terminated_length": 3015.375, "completions/min_length": 991.0, "completions/min_terminated_length": 991.0, "epoch": 0.45047489823609227, "frac_reward_zero_std": 0.0, "grad_norm": 0.7470451593399048, "kl": NaN, "learning_rate": 2.796756383712905e-07, "loss": -0.0824, "num_tokens": 43358542.0, "reward": 0.7000000476837158, "reward_std": 0.3009530007839203, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.4472135901451111, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 1328 }, { "completion_length": 2980.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4580.0, "completions/max_terminated_length": 4580.0, "completions/mean_length": 2980.58349609375, "completions/mean_terminated_length": 2980.58349609375, "completions/min_length": 1358.0, "completions/min_terminated_length": 1358.0, "epoch": 0.45081411126187243, "frac_reward_zero_std": 0.5, "grad_norm": 0.7985662817955017, "kl": 0.0, "learning_rate": 2.7950310559006207e-07, "loss": 0.0173, "num_tokens": 43407461.0, "reward": 0.7166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1329 }, { "completion_length": 2130.5833740234375, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 4292.0, "completions/mean_length": 3777.83349609375, "completions/mean_terminated_length": 2840.77783203125, "completions/min_length": 1773.0, "completions/min_terminated_length": 1773.0, "epoch": 0.45115332428765265, "frac_reward_zero_std": 0.0, "grad_norm": 1.1298385858535767, "kl": NaN, "learning_rate": 2.793305728088337e-07, "loss": -0.0642, "num_tokens": 43447824.0, "reward": 0.875, "reward_std": 0.5438544154167175, "rewards/correctness_reward_func/mean": 0.6500000357627869, "rewards/correctness_reward_func/std": 0.4833594560623169, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 1330 }, { "completion_length": 1151.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2310.0, "completions/max_terminated_length": 2310.0, "completions/mean_length": 1151.25, "completions/mean_terminated_length": 1151.25, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.45149253731343286, "frac_reward_zero_std": 0.0, "grad_norm": 0.10321547091007233, "kl": 0.0, "learning_rate": 2.7915804002760523e-07, "loss": 0.0014, "num_tokens": 43473567.0, "reward": 1.1666667461395264, "reward_std": 0.09559705853462219, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1331 }, { "completion_length": 1507.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2385.0, "completions/max_terminated_length": 2385.0, "completions/mean_length": 1507.0, "completions/mean_terminated_length": 1507.0, "completions/min_length": 1056.0, "completions/min_terminated_length": 1056.0, "epoch": 0.451831750339213, "frac_reward_zero_std": 1.0, "grad_norm": 3.1915237741486635e-07, "kl": 0.0, "learning_rate": 2.7898550724637684e-07, "loss": 0.0, "num_tokens": 43506375.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1332 }, { "completion_length": 1420.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3128.0, "completions/max_terminated_length": 3128.0, "completions/mean_length": 1420.166748046875, "completions/mean_terminated_length": 1420.166748046875, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.45217096336499324, "frac_reward_zero_std": 0.5, "grad_norm": 0.07904180884361267, "kl": 0.0, "learning_rate": 2.7881297446514834e-07, "loss": -0.0005, "num_tokens": 43539731.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1333 }, { "completion_length": 905.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1610.0, "completions/max_terminated_length": 1610.0, "completions/mean_length": 905.9166870117188, "completions/mean_terminated_length": 905.9166870117188, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 0.4525101763907734, "frac_reward_zero_std": 0.5, "grad_norm": 0.05316146835684776, "kl": 0.0, "learning_rate": 2.7864044168391994e-07, "loss": 0.0001, "num_tokens": 43562044.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1334 }, { "completion_length": 1817.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4331.0, "completions/max_terminated_length": 4331.0, "completions/mean_length": 1817.416748046875, "completions/mean_terminated_length": 1817.416748046875, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 0.4528493894165536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.784679089026915e-07, "loss": 0.0, "num_tokens": 43597329.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1335 }, { "completion_length": 1036.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2694.0, "completions/max_terminated_length": 2694.0, "completions/mean_length": 1036.5, "completions/mean_terminated_length": 1036.5, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 0.4531886024423338, "frac_reward_zero_std": 0.5, "grad_norm": 0.3732525408267975, "kl": 0.0, "learning_rate": 2.782953761214631e-07, "loss": -0.006, "num_tokens": 43625085.0, "reward": 1.2166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1336 }, { "completion_length": 2628.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4193.0, "completions/max_terminated_length": 4193.0, "completions/mean_length": 2628.0, "completions/mean_terminated_length": 2628.0, "completions/min_length": 885.0, "completions/min_terminated_length": 885.0, "epoch": 0.453527815468114, "frac_reward_zero_std": 0.5, "grad_norm": 0.09271878749132156, "kl": 0.0, "learning_rate": 2.781228433402346e-07, "loss": -0.0007, "num_tokens": 43664895.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1337 }, { "completion_length": 1032.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4084.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 1032.0, "completions/mean_terminated_length": 1032.0, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 0.45386702849389415, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.779503105590062e-07, "loss": 0.0, "num_tokens": 43684533.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1338 }, { "completion_length": 2683.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5694.0, "completions/max_terminated_length": 5694.0, "completions/mean_length": 2683.08349609375, "completions/mean_terminated_length": 2683.08349609375, "completions/min_length": 586.0, "completions/min_terminated_length": 586.0, "epoch": 0.45420624151967437, "frac_reward_zero_std": 0.5, "grad_norm": 0.048041682690382004, "kl": 0.0, "learning_rate": 2.7777777777777776e-07, "loss": 0.0001, "num_tokens": 43731070.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1339 }, { "completion_length": 911.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2079.0, "completions/max_terminated_length": 2079.0, "completions/mean_length": 911.4166870117188, "completions/mean_terminated_length": 911.4166870117188, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.45454545454545453, "frac_reward_zero_std": 0.5, "grad_norm": 0.06463975459337234, "kl": 0.0, "learning_rate": 2.7760524499654937e-07, "loss": -0.0007, "num_tokens": 43753875.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1340 }, { "completion_length": 1919.9166870117188, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5548.0, "completions/mean_length": 2469.0, "completions/mean_terminated_length": 2094.45458984375, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "epoch": 0.45488466757123475, "frac_reward_zero_std": 0.5, "grad_norm": 0.03303459286689758, "kl": NaN, "learning_rate": 2.7743271221532086e-07, "loss": -0.0057, "num_tokens": 43789784.0, "reward": 0.7875000834465027, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1341 }, { "completion_length": 673.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 673.25, "completions/mean_terminated_length": 673.25, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 0.4552238805970149, "frac_reward_zero_std": 0.5, "grad_norm": 0.06551811844110489, "kl": 0.0, "learning_rate": 2.7726017943409247e-07, "loss": -0.0001, "num_tokens": 43812041.0, "reward": 0.7749999761581421, "reward_std": 0.03872983902692795, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 1342 }, { "completion_length": 1351.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3871.0, "completions/max_terminated_length": 3871.0, "completions/mean_length": 1351.3333740234375, "completions/mean_terminated_length": 1351.3333740234375, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.4555630936227951, "frac_reward_zero_std": 0.5, "grad_norm": 0.05357801169157028, "kl": 0.0, "learning_rate": 2.77087646652864e-07, "loss": -0.0006, "num_tokens": 43837287.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1343 }, { "completion_length": 2977.2501220703125, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5870.0, "completions/mean_length": 4075.416748046875, "completions/mean_terminated_length": 3572.699951171875, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 0.4559023066485753, "frac_reward_zero_std": 0.5, "grad_norm": 0.7537221908569336, "kl": NaN, "learning_rate": 2.769151138716356e-07, "loss": -0.0727, "num_tokens": 43885236.0, "reward": 0.4166666865348816, "reward_std": 0.30441200733184814, "rewards/correctness_reward_func/mean": 0.1666666716337204, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 1344 }, { "completion_length": 1464.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2661.0, "completions/max_terminated_length": 2661.0, "completions/mean_length": 1464.916748046875, "completions/mean_terminated_length": 1464.916748046875, "completions/min_length": 584.0, "completions/min_terminated_length": 584.0, "epoch": 0.4562415196743555, "frac_reward_zero_std": 1.0, "grad_norm": 1.5211128356895642e-07, "kl": 0.0, "learning_rate": 2.767425810904072e-07, "loss": 0.0, "num_tokens": 43912793.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1345 }, { "completion_length": 1069.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2525.0, "completions/max_terminated_length": 2525.0, "completions/mean_length": 1069.25, "completions/mean_terminated_length": 1069.25, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.45658073270013566, "frac_reward_zero_std": 0.5, "grad_norm": 0.0766829252243042, "kl": 0.0, "learning_rate": 2.7657004830917874e-07, "loss": 0.0001, "num_tokens": 43934594.0, "reward": 1.0750000476837158, "reward_std": 0.038729824125766754, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 1346 }, { "completion_length": 1273.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2164.0, "completions/max_terminated_length": 2164.0, "completions/mean_length": 1273.8333740234375, "completions/mean_terminated_length": 1273.8333740234375, "completions/min_length": 584.0, "completions/min_terminated_length": 584.0, "epoch": 0.4569199457259159, "frac_reward_zero_std": 1.0, "grad_norm": 1.1623317419662271e-07, "kl": 0.0, "learning_rate": 2.7639751552795034e-07, "loss": 0.0, "num_tokens": 43962798.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1347 }, { "completion_length": 3138.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5449.0, "completions/max_terminated_length": 5449.0, "completions/mean_length": 3138.666748046875, "completions/mean_terminated_length": 3138.666748046875, "completions/min_length": 748.0, "completions/min_terminated_length": 748.0, "epoch": 0.45725915875169604, "frac_reward_zero_std": 0.5, "grad_norm": 0.6138724684715271, "kl": 0.0, "learning_rate": 2.7622498274672184e-07, "loss": -0.0102, "num_tokens": 44013542.0, "reward": 1.0333335399627686, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.35505014657974243, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1348 }, { "completion_length": 2888.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6087.0, "completions/max_terminated_length": 6087.0, "completions/mean_length": 2888.0, "completions/mean_terminated_length": 2888.0, "completions/min_length": 1690.0, "completions/min_terminated_length": 1690.0, "epoch": 0.45759837177747625, "frac_reward_zero_std": 0.5, "grad_norm": 0.10499357432126999, "kl": 0.0, "learning_rate": 2.7605244996549345e-07, "loss": 0.0003, "num_tokens": 44064686.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1349 }, { "completion_length": 2398.25, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6390.0, "completions/mean_length": 2947.33349609375, "completions/mean_terminated_length": 2616.272705078125, "completions/min_length": 977.0, "completions/min_terminated_length": 977.0, "epoch": 0.45793758480325647, "frac_reward_zero_std": 0.5, "grad_norm": 0.13163456320762634, "kl": NaN, "learning_rate": 2.75879917184265e-07, "loss": -0.0129, "num_tokens": 44103893.0, "reward": 0.6750000715255737, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 1350 }, { "completion_length": 2737.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5712.0, "completions/max_terminated_length": 5712.0, "completions/mean_length": 2737.916748046875, "completions/mean_terminated_length": 2737.916748046875, "completions/min_length": 1634.0, "completions/min_terminated_length": 1634.0, "epoch": 0.45827679782903663, "frac_reward_zero_std": 0.5, "grad_norm": 0.5492063164710999, "kl": 0.0, "learning_rate": 2.757073844030366e-07, "loss": 0.031, "num_tokens": 44152000.0, "reward": 1.0500001907348633, "reward_std": 0.17606817185878754, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.24308621883392334, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1351 }, { "completion_length": 2296.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5401.0, "completions/max_terminated_length": 5401.0, "completions/mean_length": 2296.08349609375, "completions/mean_terminated_length": 2296.08349609375, "completions/min_length": 546.0, "completions/min_terminated_length": 546.0, "epoch": 0.45861601085481685, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.755348516218081e-07, "loss": 0.0, "num_tokens": 44196569.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1352 }, { "completion_length": 2422.166748046875, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6024.0, "completions/mean_length": 3520.33349609375, "completions/mean_terminated_length": 2906.60009765625, "completions/min_length": 671.0, "completions/min_terminated_length": 671.0, "epoch": 0.458955223880597, "frac_reward_zero_std": 0.0, "grad_norm": 1.9853445291519165, "kl": NaN, "learning_rate": 2.753623188405797e-07, "loss": -0.1153, "num_tokens": 44241703.0, "reward": 1.0333333015441895, "reward_std": 0.3904307782649994, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.37618499994277954, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 1353 }, { "completion_length": 2679.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4402.0, "completions/max_terminated_length": 4402.0, "completions/mean_length": 2679.916748046875, "completions/mean_terminated_length": 2679.916748046875, "completions/min_length": 1430.0, "completions/min_terminated_length": 1430.0, "epoch": 0.4592944369063772, "frac_reward_zero_std": 0.5, "grad_norm": 0.5928817391395569, "kl": 0.0, "learning_rate": 2.7518978605935126e-07, "loss": -0.0174, "num_tokens": 44288226.0, "reward": 1.1041667461395264, "reward_std": 0.23474276065826416, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1354 }, { "completion_length": 1118.3333435058594, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5521.0, "completions/mean_length": 2765.58349609375, "completions/mean_terminated_length": 1491.111083984375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.4596336499321574, "frac_reward_zero_std": 0.5, "grad_norm": 0.8411073088645935, "kl": NaN, "learning_rate": 2.750172532781228e-07, "loss": -0.0899, "num_tokens": 44309098.0, "reward": 0.8583334684371948, "reward_std": 0.33973026275634766, "rewards/correctness_reward_func/mean": 0.6333333253860474, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 1355 }, { "completion_length": 2228.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3027.0, "completions/max_terminated_length": 3027.0, "completions/mean_length": 2228.58349609375, "completions/mean_terminated_length": 2228.58349609375, "completions/min_length": 991.0, "completions/min_terminated_length": 991.0, "epoch": 0.4599728629579376, "frac_reward_zero_std": 0.0, "grad_norm": 0.15095140039920807, "kl": 0.0, "learning_rate": 2.7484472049689437e-07, "loss": 0.0036, "num_tokens": 44350349.0, "reward": 1.183333396911621, "reward_std": 0.09246458858251572, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1356 }, { "completion_length": 1922.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5743.0, "completions/max_terminated_length": 5743.0, "completions/mean_length": 1922.166748046875, "completions/mean_terminated_length": 1922.166748046875, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.46031207598371776, "frac_reward_zero_std": 0.5, "grad_norm": 0.11898327618837357, "kl": 0.0, "learning_rate": 2.74672187715666e-07, "loss": -0.0034, "num_tokens": 44384131.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1357 }, { "completion_length": 1807.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4046.0, "completions/max_terminated_length": 4046.0, "completions/mean_length": 1807.3333740234375, "completions/mean_terminated_length": 1807.3333740234375, "completions/min_length": 651.0, "completions/min_terminated_length": 651.0, "epoch": 0.460651289009498, "frac_reward_zero_std": 0.5, "grad_norm": 0.05515772104263306, "kl": 0.0, "learning_rate": 2.7449965493443753e-07, "loss": -0.0, "num_tokens": 44415893.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1358 }, { "completion_length": 2325.08349609375, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5701.0, "completions/mean_length": 3972.33349609375, "completions/mean_terminated_length": 3100.111083984375, "completions/min_length": 1449.0, "completions/min_terminated_length": 1449.0, "epoch": 0.46099050203527814, "frac_reward_zero_std": 0.0, "grad_norm": 0.9940681457519531, "kl": NaN, "learning_rate": 2.743271221532091e-07, "loss": -0.0837, "num_tokens": 44453796.0, "reward": 0.7750000953674316, "reward_std": 0.3117799162864685, "rewards/correctness_reward_func/mean": 0.550000011920929, "rewards/correctness_reward_func/std": 0.4100997745990753, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 1359 }, { "completion_length": 1108.9167175292969, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 1913.0, "completions/mean_length": 1658.0, "completions/mean_terminated_length": 1209.727294921875, "completions/min_length": 782.0, "completions/min_terminated_length": 782.0, "epoch": 0.46132971506105835, "frac_reward_zero_std": 0.5, "grad_norm": 0.32349249720573425, "kl": NaN, "learning_rate": 2.741545893719807e-07, "loss": -0.0061, "num_tokens": 44476313.0, "reward": 0.9083333015441895, "reward_std": 0.23327383399009705, "rewards/correctness_reward_func/mean": 0.6333333849906921, "rewards/correctness_reward_func/std": 0.4735424220561981, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1360 }, { "completion_length": 1641.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2876.0, "completions/max_terminated_length": 2876.0, "completions/mean_length": 1641.5, "completions/mean_terminated_length": 1641.5, "completions/min_length": 1069.0, "completions/min_terminated_length": 1069.0, "epoch": 0.4616689280868385, "frac_reward_zero_std": 0.0, "grad_norm": 0.09256188571453094, "kl": 0.0, "learning_rate": 2.7398205659075224e-07, "loss": 0.0009, "num_tokens": 44507375.0, "reward": 1.2000000476837158, "reward_std": 0.08164963126182556, "rewards/correctness_reward_func/mean": 0.9000000357627869, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1361 }, { "completion_length": 1306.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3231.0, "completions/max_terminated_length": 3231.0, "completions/mean_length": 1306.75, "completions/mean_terminated_length": 1306.75, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 0.46200814111261873, "frac_reward_zero_std": 1.0, "grad_norm": 1.1537705546516008e-07, "kl": 0.0, "learning_rate": 2.7380952380952385e-07, "loss": 0.0, "num_tokens": 44534354.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1362 }, { "completion_length": 626.0000152587891, "completions/clipped_ratio": 0.0, "completions/max_length": 2083.0, "completions/max_terminated_length": 2083.0, "completions/mean_length": 626.0, "completions/mean_terminated_length": 626.0, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.4623473541383989, "frac_reward_zero_std": 0.5, "grad_norm": 0.336239755153656, "kl": 0.0, "learning_rate": 2.7363699102829534e-07, "loss": -0.0117, "num_tokens": 44550560.0, "reward": 0.8666666746139526, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1363 }, { "completion_length": 1499.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2181.0, "completions/max_terminated_length": 2181.0, "completions/mean_length": 1499.416748046875, "completions/mean_terminated_length": 1499.416748046875, "completions/min_length": 853.0, "completions/min_terminated_length": 853.0, "epoch": 0.4626865671641791, "frac_reward_zero_std": 1.0, "grad_norm": 1.5041536016724422e-07, "kl": 0.0, "learning_rate": 2.7346445824706695e-07, "loss": 0.0, "num_tokens": 44582863.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1364 }, { "completion_length": 1534.5833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3616.0, "completions/mean_length": 2083.666748046875, "completions/mean_terminated_length": 1674.0909423828125, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "epoch": 0.46302578018995927, "frac_reward_zero_std": 0.0, "grad_norm": 0.6371083855628967, "kl": NaN, "learning_rate": 2.732919254658385e-07, "loss": -0.0239, "num_tokens": 44611478.0, "reward": 0.9416667222976685, "reward_std": 0.5103103518486023, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.4923659861087799, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 1365 }, { "completion_length": 1200.8333587646484, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4038.0, "completions/mean_length": 2299.0, "completions/mean_terminated_length": 1441.0, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 0.4633649932157395, "frac_reward_zero_std": 0.5, "grad_norm": 0.43205299973487854, "kl": NaN, "learning_rate": 2.7311939268461006e-07, "loss": -0.0365, "num_tokens": 44638068.0, "reward": 0.3166666626930237, "reward_std": 0.20165979862213135, "rewards/correctness_reward_func/mean": 0.06666667014360428, "rewards/correctness_reward_func/std": 0.2309401035308838, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 1366 }, { "completion_length": 1765.3334350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 4639.0, "completions/max_terminated_length": 4639.0, "completions/mean_length": 1765.3333740234375, "completions/mean_terminated_length": 1765.3333740234375, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.4637042062415197, "frac_reward_zero_std": 0.5, "grad_norm": 0.4781854748725891, "kl": 0.0, "learning_rate": 2.729468599033816e-07, "loss": 0.0287, "num_tokens": 44668984.0, "reward": 1.066666603088379, "reward_std": 0.20655910670757294, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941801071167, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1367 }, { "completion_length": 2217.916748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 4841.0, "completions/mean_length": 3865.166748046875, "completions/mean_terminated_length": 2957.22216796875, "completions/min_length": 881.0, "completions/min_terminated_length": 881.0, "epoch": 0.46404341926729986, "frac_reward_zero_std": 0.0, "grad_norm": 0.15069955587387085, "kl": NaN, "learning_rate": 2.727743271221532e-07, "loss": -0.0262, "num_tokens": 44712765.0, "reward": 0.22500000894069672, "reward_std": 0.13869690895080566, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 1368 }, { "completion_length": 922.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1428.0, "completions/max_terminated_length": 1428.0, "completions/mean_length": 922.1666870117188, "completions/mean_terminated_length": 922.1666870117188, "completions/min_length": 580.0, "completions/min_terminated_length": 580.0, "epoch": 0.4643826322930801, "frac_reward_zero_std": 0.5, "grad_norm": 0.09589047729969025, "kl": 0.0, "learning_rate": 2.7260179434092477e-07, "loss": 0.0013, "num_tokens": 44741855.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1369 }, { "completion_length": 1946.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4973.0, "completions/max_terminated_length": 4973.0, "completions/mean_length": 1946.0833740234375, "completions/mean_terminated_length": 1946.0833740234375, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "epoch": 0.46472184531886024, "frac_reward_zero_std": 1.0, "grad_norm": 1.5898771721367666e-07, "kl": 0.0, "learning_rate": 2.724292615596963e-07, "loss": 0.0, "num_tokens": 44777808.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1370 }, { "completion_length": 1022.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2065.0, "completions/max_terminated_length": 2065.0, "completions/mean_length": 1022.5833740234375, "completions/mean_terminated_length": 1022.5833740234375, "completions/min_length": 591.0, "completions/min_terminated_length": 591.0, "epoch": 0.46506105834464045, "frac_reward_zero_std": 0.5, "grad_norm": 0.07959319651126862, "kl": 0.0, "learning_rate": 2.7225672877846787e-07, "loss": 0.0022, "num_tokens": 44798827.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1371 }, { "completion_length": 1187.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2370.0, "completions/max_terminated_length": 2370.0, "completions/mean_length": 1187.25, "completions/mean_terminated_length": 1187.25, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.4654002713704206, "frac_reward_zero_std": 0.0, "grad_norm": 0.11818695813417435, "kl": 0.0, "learning_rate": 2.720841959972395e-07, "loss": 0.0003, "num_tokens": 44824456.0, "reward": 1.1083333492279053, "reward_std": 0.0903695821762085, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 1372 }, { "completion_length": 1399.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3023.0, "completions/max_terminated_length": 3023.0, "completions/mean_length": 1399.5, "completions/mean_terminated_length": 1399.5, "completions/min_length": 614.0, "completions/min_terminated_length": 614.0, "epoch": 0.46573948439620083, "frac_reward_zero_std": 0.5, "grad_norm": 0.06676678359508514, "kl": 0.0, "learning_rate": 2.7191166321601103e-07, "loss": -0.0007, "num_tokens": 44852140.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1373 }, { "completion_length": 1589.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2784.0, "completions/max_terminated_length": 2784.0, "completions/mean_length": 1589.0, "completions/mean_terminated_length": 1589.0, "completions/min_length": 960.0, "completions/min_terminated_length": 960.0, "epoch": 0.466078697421981, "frac_reward_zero_std": 1.0, "grad_norm": 2.839094861428748e-07, "kl": 0.0, "learning_rate": 2.717391304347826e-07, "loss": 0.0, "num_tokens": 44882992.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1374 }, { "completion_length": 871.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1112.0, "completions/max_terminated_length": 1112.0, "completions/mean_length": 871.4166870117188, "completions/mean_terminated_length": 871.4166870117188, "completions/min_length": 587.0, "completions/min_terminated_length": 587.0, "epoch": 0.4664179104477612, "frac_reward_zero_std": 0.5, "grad_norm": 0.05492303892970085, "kl": 0.0, "learning_rate": 2.7156659765355414e-07, "loss": 0.0003, "num_tokens": 44907483.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1375 }, { "completion_length": 558.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 917.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 558.9166870117188, "completions/mean_terminated_length": 558.9166870117188, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.46675712347354137, "frac_reward_zero_std": 0.5, "grad_norm": 0.033701442182064056, "kl": 0.0, "learning_rate": 2.7139406487232574e-07, "loss": -0.0002, "num_tokens": 44928500.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1376 }, { "completion_length": 1283.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 1762.0, "completions/max_terminated_length": 1762.0, "completions/mean_length": 1283.416748046875, "completions/mean_terminated_length": 1283.416748046875, "completions/min_length": 770.0, "completions/min_terminated_length": 770.0, "epoch": 0.4670963364993216, "frac_reward_zero_std": 0.5, "grad_norm": 0.4165017008781433, "kl": 0.0, "learning_rate": 2.712215320910973e-07, "loss": -0.0063, "num_tokens": 44955937.0, "reward": 1.2166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1377 }, { "completion_length": 2015.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4422.0, "completions/max_terminated_length": 4422.0, "completions/mean_length": 2015.916748046875, "completions/mean_terminated_length": 2015.916748046875, "completions/min_length": 645.0, "completions/min_terminated_length": 645.0, "epoch": 0.46743554952510175, "frac_reward_zero_std": 0.5, "grad_norm": 0.09320026636123657, "kl": 0.0, "learning_rate": 2.7104899930986885e-07, "loss": -0.0011, "num_tokens": 44990808.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1378 }, { "completion_length": 1616.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2593.0, "completions/max_terminated_length": 2593.0, "completions/mean_length": 1616.5, "completions/mean_terminated_length": 1616.5, "completions/min_length": 872.0, "completions/min_terminated_length": 872.0, "epoch": 0.46777476255088196, "frac_reward_zero_std": 0.5, "grad_norm": 0.09291809052228928, "kl": 0.0, "learning_rate": 2.7087646652864045e-07, "loss": -0.0005, "num_tokens": 45023298.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1379 }, { "completion_length": 1526.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2874.0, "completions/max_terminated_length": 2874.0, "completions/mean_length": 1526.416748046875, "completions/mean_terminated_length": 1526.416748046875, "completions/min_length": 871.0, "completions/min_terminated_length": 871.0, "epoch": 0.4681139755766621, "frac_reward_zero_std": 0.0, "grad_norm": 0.12247850745916367, "kl": 0.0, "learning_rate": 2.70703933747412e-07, "loss": 0.0033, "num_tokens": 45051101.0, "reward": 1.25, "reward_std": 0.09246455878019333, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1380 }, { "completion_length": 2169.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5433.0, "completions/max_terminated_length": 5433.0, "completions/mean_length": 2169.25, "completions/mean_terminated_length": 2169.25, "completions/min_length": 683.0, "completions/min_terminated_length": 683.0, "epoch": 0.46845318860244234, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.7053140096618356e-07, "loss": 0.0, "num_tokens": 45086630.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1381 }, { "completion_length": 2399.0001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4132.0, "completions/mean_length": 2948.08349609375, "completions/mean_terminated_length": 2617.091064453125, "completions/min_length": 841.0, "completions/min_terminated_length": 841.0, "epoch": 0.4687924016282225, "frac_reward_zero_std": 0.5, "grad_norm": 0.28633424639701843, "kl": NaN, "learning_rate": 2.703588681849551e-07, "loss": -0.0499, "num_tokens": 45128084.0, "reward": 1.0750000476837158, "reward_std": 0.2602882981300354, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1382 }, { "completion_length": 2954.25, "completions/clipped_ratio": 0.0, "completions/max_length": 6116.0, "completions/max_terminated_length": 6116.0, "completions/mean_length": 2954.25, "completions/mean_terminated_length": 2954.25, "completions/min_length": 977.0, "completions/min_terminated_length": 977.0, "epoch": 0.4691316146540027, "frac_reward_zero_std": 1.0, "grad_norm": 1.1254859799691985e-07, "kl": 0.0, "learning_rate": 2.701863354037267e-07, "loss": 0.0, "num_tokens": 45177077.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1383 }, { "completion_length": 871.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1748.0, "completions/max_terminated_length": 1748.0, "completions/mean_length": 871.8333740234375, "completions/mean_terminated_length": 871.8333740234375, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 0.4694708276797829, "frac_reward_zero_std": 1.0, "grad_norm": 1.476473130423983e-07, "kl": 0.0, "learning_rate": 2.7001380262249827e-07, "loss": 0.0, "num_tokens": 45196725.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1384 }, { "completion_length": 1569.3333740234375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5607.0, "completions/mean_length": 2667.5, "completions/mean_terminated_length": 1883.2000732421875, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.4698100407055631, "frac_reward_zero_std": 0.5, "grad_norm": 0.5347140431404114, "kl": NaN, "learning_rate": 2.698412698412698e-07, "loss": -0.0512, "num_tokens": 45226273.0, "reward": 0.949999988079071, "reward_std": 0.279284805059433, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.43064433336257935, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 1385 }, { "completion_length": 663.6666717529297, "completions/clipped_ratio": 0.0, "completions/max_length": 1326.0, "completions/max_terminated_length": 1326.0, "completions/mean_length": 663.6666870117188, "completions/mean_terminated_length": 663.6666870117188, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.4701492537313433, "frac_reward_zero_std": 1.0, "grad_norm": 1.0044725229363394e-07, "kl": 0.0, "learning_rate": 2.696687370600414e-07, "loss": 0.0, "num_tokens": 45249843.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1386 }, { "completion_length": 1092.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3056.0, "completions/max_terminated_length": 3056.0, "completions/mean_length": 1092.0833740234375, "completions/mean_terminated_length": 1092.0833740234375, "completions/min_length": 596.0, "completions/min_terminated_length": 596.0, "epoch": 0.47048846675712347, "frac_reward_zero_std": 0.5, "grad_norm": 0.05867522209882736, "kl": 0.0, "learning_rate": 2.69496204278813e-07, "loss": 0.0014, "num_tokens": 45276526.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1387 }, { "completion_length": 1544.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2361.0, "completions/max_terminated_length": 2361.0, "completions/mean_length": 1544.8333740234375, "completions/mean_terminated_length": 1544.8333740234375, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 0.4708276797829037, "frac_reward_zero_std": 1.0, "grad_norm": 1.75040440808516e-07, "kl": 0.0, "learning_rate": 2.693236714975845e-07, "loss": 0.0, "num_tokens": 45307676.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1388 }, { "completion_length": 1142.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2700.0, "completions/max_terminated_length": 2700.0, "completions/mean_length": 1142.5833740234375, "completions/mean_terminated_length": 1142.5833740234375, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.47116689280868385, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.691511387163561e-07, "loss": 0.0, "num_tokens": 45332469.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1389 }, { "completion_length": 1232.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3272.0, "completions/max_terminated_length": 3272.0, "completions/mean_length": 1232.5, "completions/mean_terminated_length": 1232.5, "completions/min_length": 631.0, "completions/min_terminated_length": 631.0, "epoch": 0.47150610583446406, "frac_reward_zero_std": 0.0, "grad_norm": 0.11917827278375626, "kl": 0.0, "learning_rate": 2.6897860593512764e-07, "loss": 0.0013, "num_tokens": 45356457.0, "reward": 1.2333333492279053, "reward_std": 0.0955970510840416, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1390 }, { "completion_length": 1050.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1873.0, "completions/max_terminated_length": 1873.0, "completions/mean_length": 1050.25, "completions/mean_terminated_length": 1050.25, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 0.4718453188602442, "frac_reward_zero_std": 0.5, "grad_norm": 0.3340080678462982, "kl": 0.0, "learning_rate": 2.6880607315389925e-07, "loss": -0.0025, "num_tokens": 45378738.0, "reward": 0.7833334803581238, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4833333194255829, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1391 }, { "completion_length": 2371.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 6558.0, "completions/max_terminated_length": 6558.0, "completions/mean_length": 2371.58349609375, "completions/mean_terminated_length": 2371.58349609375, "completions/min_length": 794.0, "completions/min_terminated_length": 794.0, "epoch": 0.47218453188602444, "frac_reward_zero_std": 0.5, "grad_norm": 0.6329807043075562, "kl": 0.0, "learning_rate": 2.686335403726708e-07, "loss": 0.0086, "num_tokens": 45419053.0, "reward": 0.9833334684371948, "reward_std": 0.222860187292099, "rewards/correctness_reward_func/mean": 0.6833333969116211, "rewards/correctness_reward_func/std": 0.32427075505256653, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1392 }, { "completion_length": 836.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1138.0, "completions/max_terminated_length": 1138.0, "completions/mean_length": 836.5833740234375, "completions/mean_terminated_length": 836.5833740234375, "completions/min_length": 527.0, "completions/min_terminated_length": 527.0, "epoch": 0.4725237449118046, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.6846100759144235e-07, "loss": 0.0, "num_tokens": 45441650.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1393 }, { "completion_length": 2054.0, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5777.0, "completions/mean_length": 3152.166748046875, "completions/mean_terminated_length": 2464.800048828125, "completions/min_length": 929.0, "completions/min_terminated_length": 929.0, "epoch": 0.4728629579375848, "frac_reward_zero_std": 0.0, "grad_norm": 0.5216564536094666, "kl": NaN, "learning_rate": 2.6828847481021396e-07, "loss": -0.0972, "num_tokens": 45477248.0, "reward": 1.0333333015441895, "reward_std": 0.3904307782649994, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.37618499994277954, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 1394 }, { "completion_length": 1630.8333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3604.0, "completions/mean_length": 2179.916748046875, "completions/mean_terminated_length": 1779.0909423828125, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 0.473202170963365, "frac_reward_zero_std": 0.0, "grad_norm": 0.5719637274742126, "kl": NaN, "learning_rate": 2.681159420289855e-07, "loss": -0.0263, "num_tokens": 45509598.0, "reward": 1.0916666984558105, "reward_std": 0.34151846170425415, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.3857303261756897, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1395 }, { "completion_length": 1958.3334350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 3465.0, "completions/max_terminated_length": 3465.0, "completions/mean_length": 1958.3333740234375, "completions/mean_terminated_length": 1958.3333740234375, "completions/min_length": 712.0, "completions/min_terminated_length": 712.0, "epoch": 0.4735413839891452, "frac_reward_zero_std": 0.5, "grad_norm": 0.0869276225566864, "kl": 0.0, "learning_rate": 2.6794340924775706e-07, "loss": 0.0005, "num_tokens": 45547174.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1396 }, { "completion_length": 862.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2373.0, "completions/max_terminated_length": 2373.0, "completions/mean_length": 862.4166870117188, "completions/mean_terminated_length": 862.4166870117188, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.47388059701492535, "frac_reward_zero_std": 1.0, "grad_norm": 1.951799077914984e-07, "kl": 0.0, "learning_rate": 2.677708764665286e-07, "loss": 0.0, "num_tokens": 45565659.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1397 }, { "completion_length": 1301.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2271.0, "completions/max_terminated_length": 2271.0, "completions/mean_length": 1301.8333740234375, "completions/mean_terminated_length": 1301.8333740234375, "completions/min_length": 755.0, "completions/min_terminated_length": 755.0, "epoch": 0.47421981004070557, "frac_reward_zero_std": 0.0, "grad_norm": 0.5524554252624512, "kl": 0.0, "learning_rate": 2.675983436853002e-07, "loss": 0.0115, "num_tokens": 45591679.0, "reward": 1.066666841506958, "reward_std": 0.3098386526107788, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941801071167, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1398 }, { "completion_length": 1022.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2235.0, "completions/max_terminated_length": 2235.0, "completions/mean_length": 1022.9166870117188, "completions/mean_terminated_length": 1022.9166870117188, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "epoch": 0.47455902306648573, "frac_reward_zero_std": 0.5, "grad_norm": 0.09795229136943817, "kl": 0.0, "learning_rate": 2.674258109040717e-07, "loss": -0.001, "num_tokens": 45615612.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1399 }, { "completion_length": 1622.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3974.0, "completions/max_terminated_length": 3974.0, "completions/mean_length": 1622.25, "completions/mean_terminated_length": 1622.25, "completions/min_length": 694.0, "completions/min_terminated_length": 694.0, "epoch": 0.47489823609226595, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.6725327812284333e-07, "loss": 0.0, "num_tokens": 45644187.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1400 }, { "completion_length": 2865.166748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4689.0, "completions/mean_length": 3414.25, "completions/mean_terminated_length": 3125.636474609375, "completions/min_length": 1042.0, "completions/min_terminated_length": 1042.0, "epoch": 0.4752374491180461, "frac_reward_zero_std": 0.0, "grad_norm": 0.7283228635787964, "kl": NaN, "learning_rate": 2.670807453416149e-07, "loss": -0.0311, "num_tokens": 45691661.0, "reward": 0.8416666984558105, "reward_std": 0.4827354848384857, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.42497774958610535, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1401 }, { "completion_length": 455.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 455.5833435058594, "completions/mean_terminated_length": 455.5833435058594, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.4755766621438263, "frac_reward_zero_std": 1.0, "grad_norm": 2.0006343959266815e-07, "kl": 0.0, "learning_rate": 2.669082125603865e-07, "loss": 0.0, "num_tokens": 45705252.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1402 }, { "completion_length": 3351.75, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6313.0, "completions/mean_length": 4449.9169921875, "completions/mean_terminated_length": 4022.10009765625, "completions/min_length": 2641.0, "completions/min_terminated_length": 2641.0, "epoch": 0.47591587516960654, "frac_reward_zero_std": 0.5, "grad_norm": 0.5647222995758057, "kl": NaN, "learning_rate": 2.66735679779158e-07, "loss": -0.02, "num_tokens": 45762789.0, "reward": 0.9458334445953369, "reward_std": 0.25904473662376404, "rewards/correctness_reward_func/mean": 0.6833333969116211, "rewards/correctness_reward_func/std": 0.32427075505256653, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 1403 }, { "completion_length": 1160.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2446.0, "completions/max_terminated_length": 2446.0, "completions/mean_length": 1160.666748046875, "completions/mean_terminated_length": 1160.666748046875, "completions/min_length": 678.0, "completions/min_terminated_length": 678.0, "epoch": 0.4762550881953867, "frac_reward_zero_std": 0.5, "grad_norm": 0.07784174382686615, "kl": 0.0, "learning_rate": 2.665631469979296e-07, "loss": -0.0011, "num_tokens": 45787151.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1404 }, { "completion_length": 954.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1779.0, "completions/max_terminated_length": 1779.0, "completions/mean_length": 954.3333740234375, "completions/mean_terminated_length": 954.3333740234375, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 0.4765943012211669, "frac_reward_zero_std": 1.0, "grad_norm": 1.3277258403832093e-07, "kl": 0.0, "learning_rate": 2.6639061421670115e-07, "loss": 0.0, "num_tokens": 45810009.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1405 }, { "completion_length": 1001.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2810.0, "completions/max_terminated_length": 2810.0, "completions/mean_length": 1001.0833740234375, "completions/mean_terminated_length": 1001.0833740234375, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.4769335142469471, "frac_reward_zero_std": 0.5, "grad_norm": 0.08954577147960663, "kl": 0.0, "learning_rate": 2.6621808143547275e-07, "loss": 0.003, "num_tokens": 45832600.0, "reward": 1.25, "reward_std": 0.05477222427725792, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1406 }, { "completion_length": 3319.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5863.0, "completions/max_terminated_length": 5863.0, "completions/mean_length": 3319.33349609375, "completions/mean_terminated_length": 3319.33349609375, "completions/min_length": 1212.0, "completions/min_terminated_length": 1212.0, "epoch": 0.4772727272727273, "frac_reward_zero_std": 0.5, "grad_norm": 0.5688414573669434, "kl": 0.0, "learning_rate": 2.660455486542443e-07, "loss": -0.0096, "num_tokens": 45879584.0, "reward": 0.7166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1407 }, { "completion_length": 810.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1457.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 810.25, "completions/mean_terminated_length": 810.25, "completions/min_length": 542.0, "completions/min_terminated_length": 542.0, "epoch": 0.47761194029850745, "frac_reward_zero_std": 1.0, "grad_norm": 8.647364779790223e-08, "kl": 0.0, "learning_rate": 2.6587301587301586e-07, "loss": 0.0, "num_tokens": 45903581.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1408 }, { "completion_length": 615.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 615.3333740234375, "completions/mean_terminated_length": 615.3333740234375, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 0.47795115332428767, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.6570048309178746e-07, "loss": 0.0, "num_tokens": 45925179.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1409 }, { "completion_length": 664.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1551.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 664.4166870117188, "completions/mean_terminated_length": 664.4166870117188, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 0.47829036635006783, "frac_reward_zero_std": 1.0, "grad_norm": 1.631043318184311e-07, "kl": 0.0, "learning_rate": 2.6552795031055896e-07, "loss": 0.0, "num_tokens": 45945254.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1410 }, { "completion_length": 697.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1125.0, "completions/max_terminated_length": 1125.0, "completions/mean_length": 697.8333740234375, "completions/mean_terminated_length": 697.8333740234375, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.47862957937584805, "frac_reward_zero_std": 0.5, "grad_norm": 0.03392680734395981, "kl": 0.0, "learning_rate": 2.6535541752933057e-07, "loss": -0.0005, "num_tokens": 45968226.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1411 }, { "completion_length": 1490.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3882.0, "completions/max_terminated_length": 3882.0, "completions/mean_length": 1490.166748046875, "completions/mean_terminated_length": 1490.166748046875, "completions/min_length": 871.0, "completions/min_terminated_length": 871.0, "epoch": 0.4789687924016282, "frac_reward_zero_std": 0.5, "grad_norm": 0.10917965322732925, "kl": 0.0, "learning_rate": 2.651828847481021e-07, "loss": -0.0056, "num_tokens": 45996188.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1412 }, { "completion_length": 650.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 879.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 650.4166870117188, "completions/mean_terminated_length": 650.4166870117188, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.4793080054274084, "frac_reward_zero_std": 0.5, "grad_norm": 0.06631017476320267, "kl": 0.0, "learning_rate": 2.6501035196687373e-07, "loss": -0.0005, "num_tokens": 46014763.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1413 }, { "completion_length": 1473.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3696.0, "completions/max_terminated_length": 3696.0, "completions/mean_length": 1473.5, "completions/mean_terminated_length": 1473.5, "completions/min_length": 609.0, "completions/min_terminated_length": 609.0, "epoch": 0.4796472184531886, "frac_reward_zero_std": 0.5, "grad_norm": 0.35284164547920227, "kl": 0.0, "learning_rate": 2.6483781918564523e-07, "loss": -0.0096, "num_tokens": 46043635.0, "reward": 1.2166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1414 }, { "completion_length": 1586.0000915527344, "completions/clipped_ratio": 0.0, "completions/max_length": 4485.0, "completions/max_terminated_length": 4485.0, "completions/mean_length": 1586.0, "completions/mean_terminated_length": 1586.0, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.4799864314789688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.6466528640441683e-07, "loss": 0.0, "num_tokens": 46076995.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1415 }, { "completion_length": 818.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1146.0, "completions/max_terminated_length": 1146.0, "completions/mean_length": 818.75, "completions/mean_terminated_length": 818.75, "completions/min_length": 593.0, "completions/min_terminated_length": 593.0, "epoch": 0.48032564450474896, "frac_reward_zero_std": 0.5, "grad_norm": 0.27206286787986755, "kl": 0.0, "learning_rate": 2.644927536231884e-07, "loss": 0.0005, "num_tokens": 46095772.0, "reward": 1.0333335399627686, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.2309401035308838, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1416 }, { "completion_length": 1785.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3139.0, "completions/max_terminated_length": 3139.0, "completions/mean_length": 1785.0833740234375, "completions/mean_terminated_length": 1785.0833740234375, "completions/min_length": 554.0, "completions/min_terminated_length": 554.0, "epoch": 0.4806648575305292, "frac_reward_zero_std": 0.0, "grad_norm": 0.1427237093448639, "kl": 0.0, "learning_rate": 2.6432022084196e-07, "loss": -0.0019, "num_tokens": 46130201.0, "reward": 1.183333396911621, "reward_std": 0.10641201585531235, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1417 }, { "completion_length": 964.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1505.0, "completions/max_terminated_length": 1505.0, "completions/mean_length": 964.25, "completions/mean_terminated_length": 964.25, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 0.48100407055630934, "frac_reward_zero_std": 1.0, "grad_norm": 1.776729305902336e-07, "kl": 0.0, "learning_rate": 2.641476880607315e-07, "loss": 0.0, "num_tokens": 46155470.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1418 }, { "completion_length": 812.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1435.0, "completions/max_terminated_length": 1435.0, "completions/mean_length": 812.8333740234375, "completions/mean_terminated_length": 812.8333740234375, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 0.48134328358208955, "frac_reward_zero_std": 1.0, "grad_norm": 1.196096235389632e-07, "kl": 0.0, "learning_rate": 2.639751552795031e-07, "loss": 0.0, "num_tokens": 46174860.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1419 }, { "completion_length": 616.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 616.5833740234375, "completions/mean_terminated_length": 616.5833740234375, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.4816824966078697, "frac_reward_zero_std": 0.5, "grad_norm": 0.25642818212509155, "kl": 0.0, "learning_rate": 2.6380262249827465e-07, "loss": -0.0015, "num_tokens": 46193347.0, "reward": 1.120833396911621, "reward_std": 0.19391795992851257, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.28069180250167847, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1420 }, { "completion_length": 1401.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2987.0, "completions/max_terminated_length": 2987.0, "completions/mean_length": 1401.8333740234375, "completions/mean_terminated_length": 1401.8333740234375, "completions/min_length": 528.0, "completions/min_terminated_length": 528.0, "epoch": 0.48202170963364993, "frac_reward_zero_std": 0.5, "grad_norm": 0.07382780313491821, "kl": 0.0, "learning_rate": 2.636300897170462e-07, "loss": -0.0006, "num_tokens": 46219055.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1421 }, { "completion_length": 962.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1606.0, "completions/max_terminated_length": 1606.0, "completions/mean_length": 962.4166870117188, "completions/mean_terminated_length": 962.4166870117188, "completions/min_length": 563.0, "completions/min_terminated_length": 563.0, "epoch": 0.48236092265943015, "frac_reward_zero_std": 0.5, "grad_norm": 0.07820718735456467, "kl": 0.0, "learning_rate": 2.6345755693581776e-07, "loss": -0.0017, "num_tokens": 46242328.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1422 }, { "completion_length": 974.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2354.0, "completions/max_terminated_length": 2354.0, "completions/mean_length": 974.4166870117188, "completions/mean_terminated_length": 974.4166870117188, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.4827001356852103, "frac_reward_zero_std": 0.5, "grad_norm": 0.09361352771520615, "kl": 0.0, "learning_rate": 2.6328502415458936e-07, "loss": -0.0013, "num_tokens": 46269129.0, "reward": 1.2333333492279053, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1423 }, { "completion_length": 1025.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2414.0, "completions/max_terminated_length": 2414.0, "completions/mean_length": 1025.25, "completions/mean_terminated_length": 1025.25, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.4830393487109905, "frac_reward_zero_std": 1.0, "grad_norm": 1.785646475127578e-07, "kl": 0.0, "learning_rate": 2.6311249137336097e-07, "loss": 0.0, "num_tokens": 46294518.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1424 }, { "completion_length": 1438.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2220.0, "completions/max_terminated_length": 2220.0, "completions/mean_length": 1438.5, "completions/mean_terminated_length": 1438.5, "completions/min_length": 730.0, "completions/min_terminated_length": 730.0, "epoch": 0.4833785617367707, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.6293995859213247e-07, "loss": 0.0, "num_tokens": 46320432.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1425 }, { "completion_length": 1559.6666870117188, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5180.0, "completions/mean_length": 2108.75, "completions/mean_terminated_length": 1701.45458984375, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 0.4837177747625509, "frac_reward_zero_std": 0.0, "grad_norm": 0.4424091875553131, "kl": NaN, "learning_rate": 2.6276742581090407e-07, "loss": -0.0118, "num_tokens": 46351796.0, "reward": 0.824999988079071, "reward_std": 0.2761762738227844, "rewards/correctness_reward_func/mean": 0.550000011920929, "rewards/correctness_reward_func/std": 0.49082493782043457, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1426 }, { "completion_length": 977.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1627.0, "completions/max_terminated_length": 1627.0, "completions/mean_length": 977.75, "completions/mean_terminated_length": 977.75, "completions/min_length": 684.0, "completions/min_terminated_length": 684.0, "epoch": 0.48405698778833106, "frac_reward_zero_std": 0.5, "grad_norm": 0.06053897365927696, "kl": 0.0, "learning_rate": 2.625948930296756e-07, "loss": 0.0005, "num_tokens": 46374125.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1427 }, { "completion_length": 2106.3334350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 4392.0, "completions/max_terminated_length": 4392.0, "completions/mean_length": 2106.33349609375, "completions/mean_terminated_length": 2106.33349609375, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.4843962008141113, "frac_reward_zero_std": 0.5, "grad_norm": 0.12771901488304138, "kl": 0.0, "learning_rate": 2.6242236024844723e-07, "loss": -0.0007, "num_tokens": 46409829.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1428 }, { "completion_length": 2233.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 6345.0, "completions/max_terminated_length": 6345.0, "completions/mean_length": 2233.83349609375, "completions/mean_terminated_length": 2233.83349609375, "completions/min_length": 946.0, "completions/min_terminated_length": 946.0, "epoch": 0.48473541383989144, "frac_reward_zero_std": 0.5, "grad_norm": 0.09867383539676666, "kl": 0.0, "learning_rate": 2.6224982746721873e-07, "loss": -0.0008, "num_tokens": 46450921.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1429 }, { "completion_length": 1652.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4448.0, "completions/max_terminated_length": 4448.0, "completions/mean_length": 1652.666748046875, "completions/mean_terminated_length": 1652.666748046875, "completions/min_length": 649.0, "completions/min_terminated_length": 649.0, "epoch": 0.48507462686567165, "frac_reward_zero_std": 0.0, "grad_norm": 0.10979257524013519, "kl": 0.0, "learning_rate": 2.6207729468599034e-07, "loss": -0.0004, "num_tokens": 46487067.0, "reward": 1.1500000953674316, "reward_std": 0.09246458858251572, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1430 }, { "completion_length": 853.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1882.0, "completions/max_terminated_length": 1882.0, "completions/mean_length": 853.4166870117188, "completions/mean_terminated_length": 853.4166870117188, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.4854138398914518, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.619047619047619e-07, "loss": 0.0, "num_tokens": 46508132.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1431 }, { "completion_length": 1674.5000610351562, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 2940.0, "completions/mean_length": 2223.58349609375, "completions/mean_terminated_length": 1826.727294921875, "completions/min_length": 711.0, "completions/min_terminated_length": 711.0, "epoch": 0.48575305291723203, "frac_reward_zero_std": 0.5, "grad_norm": 0.3374686539173126, "kl": NaN, "learning_rate": 2.6173222912353344e-07, "loss": -0.0265, "num_tokens": 46539626.0, "reward": 0.6083333492279053, "reward_std": 0.22453658282756805, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 1432 }, { "completion_length": 1976.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4573.0, "completions/max_terminated_length": 4573.0, "completions/mean_length": 1976.8333740234375, "completions/mean_terminated_length": 1976.8333740234375, "completions/min_length": 782.0, "completions/min_terminated_length": 782.0, "epoch": 0.4860922659430122, "frac_reward_zero_std": 0.5, "grad_norm": 0.35924872756004333, "kl": 0.0, "learning_rate": 2.61559696342305e-07, "loss": -0.0016, "num_tokens": 46577844.0, "reward": 0.6500000953674316, "reward_std": 0.17606817185878754, "rewards/correctness_reward_func/mean": 0.3499999940395355, "rewards/correctness_reward_func/std": 0.4358898997306824, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1433 }, { "completion_length": 1453.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3133.0, "completions/max_terminated_length": 3133.0, "completions/mean_length": 1453.0833740234375, "completions/mean_terminated_length": 1453.0833740234375, "completions/min_length": 625.0, "completions/min_terminated_length": 625.0, "epoch": 0.4864314789687924, "frac_reward_zero_std": 1.0, "grad_norm": 2.2874493765812076e-07, "kl": 0.0, "learning_rate": 2.613871635610766e-07, "loss": 0.0, "num_tokens": 46610065.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1434 }, { "completion_length": 955.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1372.0, "completions/max_terminated_length": 1372.0, "completions/mean_length": 955.3333740234375, "completions/mean_terminated_length": 955.3333740234375, "completions/min_length": 541.0, "completions/min_terminated_length": 541.0, "epoch": 0.48677069199457257, "frac_reward_zero_std": 0.5, "grad_norm": 0.44394463300704956, "kl": 0.0, "learning_rate": 2.6121463077984815e-07, "loss": 0.0007, "num_tokens": 46629671.0, "reward": 0.9000000953674316, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.6000000238418579, "rewards/correctness_reward_func/std": 0.36181363463401794, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1435 }, { "completion_length": 1287.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2031.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1287.5, "completions/mean_terminated_length": 1287.5, "completions/min_length": 922.0, "completions/min_terminated_length": 922.0, "epoch": 0.4871099050203528, "frac_reward_zero_std": 0.5, "grad_norm": 0.35083362460136414, "kl": 0.0, "learning_rate": 2.610420979986197e-07, "loss": -0.0003, "num_tokens": 46656029.0, "reward": 0.9500000476837158, "reward_std": 0.27386125922203064, "rewards/correctness_reward_func/mean": 0.6499999761581421, "rewards/correctness_reward_func/std": 0.40113475918769836, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1436 }, { "completion_length": 2633.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6382.0, "completions/max_terminated_length": 6382.0, "completions/mean_length": 2633.666748046875, "completions/mean_terminated_length": 2633.666748046875, "completions/min_length": 939.0, "completions/min_terminated_length": 939.0, "epoch": 0.48744911804613295, "frac_reward_zero_std": 0.5, "grad_norm": 0.5561209917068481, "kl": 0.0, "learning_rate": 2.6086956521739126e-07, "loss": -0.018, "num_tokens": 46699579.0, "reward": 1.2166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1437 }, { "completion_length": 1906.0834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 4259.0, "completions/max_terminated_length": 4259.0, "completions/mean_length": 1906.0833740234375, "completions/mean_terminated_length": 1906.0833740234375, "completions/min_length": 705.0, "completions/min_terminated_length": 705.0, "epoch": 0.48778833107191316, "frac_reward_zero_std": 0.5, "grad_norm": 0.635406494140625, "kl": 0.0, "learning_rate": 2.6069703243616287e-07, "loss": 0.0215, "num_tokens": 46736666.0, "reward": 0.7166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1438 }, { "completion_length": 1124.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2097.0, "completions/max_terminated_length": 2097.0, "completions/mean_length": 1124.166748046875, "completions/mean_terminated_length": 1124.166748046875, "completions/min_length": 637.0, "completions/min_terminated_length": 637.0, "epoch": 0.4881275440976934, "frac_reward_zero_std": 1.0, "grad_norm": 8.492477832078293e-08, "kl": 0.0, "learning_rate": 2.6052449965493447e-07, "loss": 0.0, "num_tokens": 46756672.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1439 }, { "completion_length": 873.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1403.0, "completions/max_terminated_length": 1403.0, "completions/mean_length": 873.0833740234375, "completions/mean_terminated_length": 873.0833740234375, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.48846675712347354, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.6035196687370597e-07, "loss": 0.0, "num_tokens": 46775345.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1440 }, { "completion_length": 1120.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1756.0, "completions/max_terminated_length": 1756.0, "completions/mean_length": 1120.5833740234375, "completions/mean_terminated_length": 1120.5833740234375, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 0.48880597014925375, "frac_reward_zero_std": 1.0, "grad_norm": 1.805644558316999e-07, "kl": 0.0, "learning_rate": 2.601794340924776e-07, "loss": 0.0, "num_tokens": 46799766.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1441 }, { "completion_length": 3815.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6544.0, "completions/max_terminated_length": 6544.0, "completions/mean_length": 3815.75, "completions/mean_terminated_length": 3815.75, "completions/min_length": 1958.0, "completions/min_terminated_length": 1958.0, "epoch": 0.4891451831750339, "frac_reward_zero_std": 0.5, "grad_norm": 0.5821070671081543, "kl": 0.0, "learning_rate": 2.6000690131124913e-07, "loss": -0.0008, "num_tokens": 46855389.0, "reward": 0.6499999761581421, "reward_std": 0.17606817185878754, "rewards/correctness_reward_func/mean": 0.3499999940395355, "rewards/correctness_reward_func/std": 0.4358898997306824, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1442 }, { "completion_length": 1550.7500610351562, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4319.0, "completions/mean_length": 2648.916748046875, "completions/mean_terminated_length": 1860.9000244140625, "completions/min_length": 627.0, "completions/min_terminated_length": 627.0, "epoch": 0.48948439620081413, "frac_reward_zero_std": 0.5, "grad_norm": 0.7266389727592468, "kl": NaN, "learning_rate": 2.598343685300207e-07, "loss": -0.026, "num_tokens": 46890738.0, "reward": 0.9166666865348816, "reward_std": 0.30441197752952576, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.4923659861087799, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 1443 }, { "completion_length": 963.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1882.0, "completions/max_terminated_length": 1882.0, "completions/mean_length": 963.5833740234375, "completions/mean_terminated_length": 963.5833740234375, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 0.4898236092265943, "frac_reward_zero_std": 1.0, "grad_norm": 1.0644764358858083e-07, "kl": 0.0, "learning_rate": 2.5966183574879224e-07, "loss": 0.0, "num_tokens": 46916977.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1444 }, { "completion_length": 653.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1193.0, "completions/max_terminated_length": 1193.0, "completions/mean_length": 653.75, "completions/mean_terminated_length": 653.75, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.4901628222523745, "frac_reward_zero_std": 0.5, "grad_norm": 0.06554730981588364, "kl": 0.0, "learning_rate": 2.5948930296756384e-07, "loss": 0.0011, "num_tokens": 46937308.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1445 }, { "completion_length": 1692.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2739.0, "completions/max_terminated_length": 2739.0, "completions/mean_length": 1692.0, "completions/mean_terminated_length": 1692.0, "completions/min_length": 701.0, "completions/min_terminated_length": 701.0, "epoch": 0.49050203527815467, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.593167701863354e-07, "loss": 0.0, "num_tokens": 46968178.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1446 }, { "completion_length": 900.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1979.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 900.0, "completions/mean_terminated_length": 900.0, "completions/min_length": 522.0, "completions/min_terminated_length": 522.0, "epoch": 0.4908412483039349, "frac_reward_zero_std": 1.0, "grad_norm": 1.882286113641385e-07, "kl": 0.0, "learning_rate": 2.5914423740510695e-07, "loss": 0.0, "num_tokens": 46990324.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1447 }, { "completion_length": 1708.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2794.0, "completions/max_terminated_length": 2794.0, "completions/mean_length": 1708.166748046875, "completions/mean_terminated_length": 1708.166748046875, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "epoch": 0.49118046132971505, "frac_reward_zero_std": 0.5, "grad_norm": 0.09449905157089233, "kl": 0.0, "learning_rate": 2.589717046238785e-07, "loss": -0.0003, "num_tokens": 47026494.0, "reward": 0.7666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1448 }, { "completion_length": 477.41668701171875, "completions/clipped_ratio": 0.0, "completions/max_length": 959.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 477.41668701171875, "completions/mean_terminated_length": 477.41668701171875, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.49151967435549526, "frac_reward_zero_std": 0.5, "grad_norm": 0.4193439781665802, "kl": 0.0, "learning_rate": 2.587991718426501e-07, "loss": 0.005, "num_tokens": 47044481.0, "reward": 0.7041666507720947, "reward_std": 0.23474274575710297, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1449 }, { "completion_length": 1075.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3197.0, "completions/max_terminated_length": 3197.0, "completions/mean_length": 1075.916748046875, "completions/mean_terminated_length": 1075.916748046875, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.4918588873812754, "frac_reward_zero_std": 1.0, "grad_norm": 9.963451219618946e-08, "kl": 0.0, "learning_rate": 2.5862663906142166e-07, "loss": 0.0, "num_tokens": 47072962.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1450 }, { "completion_length": 2795.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4084.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 2795.25, "completions/mean_terminated_length": 2795.25, "completions/min_length": 1543.0, "completions/min_terminated_length": 1543.0, "epoch": 0.49219810040705564, "frac_reward_zero_std": 0.5, "grad_norm": 0.07675554603338242, "kl": 0.0, "learning_rate": 2.584541062801932e-07, "loss": -0.0011, "num_tokens": 47116405.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1451 }, { "completion_length": 854.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1251.0, "completions/max_terminated_length": 1251.0, "completions/mean_length": 854.6666870117188, "completions/mean_terminated_length": 854.6666870117188, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 0.4925373134328358, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.5828157349896476e-07, "loss": 0.0, "num_tokens": 47135607.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1452 }, { "completion_length": 3118.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5462.0, "completions/max_terminated_length": 5462.0, "completions/mean_length": 3118.666748046875, "completions/mean_terminated_length": 3118.666748046875, "completions/min_length": 1826.0, "completions/min_terminated_length": 1826.0, "epoch": 0.492876526458616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.5810904071773637e-07, "loss": 0.0, "num_tokens": 47188067.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1453 }, { "completion_length": 915.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2052.0, "completions/max_terminated_length": 2052.0, "completions/mean_length": 915.9166870117188, "completions/mean_terminated_length": 915.9166870117188, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 0.4932157394843962, "frac_reward_zero_std": 0.0, "grad_norm": 0.23152890801429749, "kl": 0.0, "learning_rate": 2.57936507936508e-07, "loss": -0.0019, "num_tokens": 47209150.0, "reward": 1.0833333730697632, "reward_std": 0.21807155013084412, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1454 }, { "completion_length": 749.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1265.0, "completions/max_terminated_length": 1265.0, "completions/mean_length": 749.6666870117188, "completions/mean_terminated_length": 749.6666870117188, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "epoch": 0.4935549525101764, "frac_reward_zero_std": 0.5, "grad_norm": 0.07760334759950638, "kl": 0.0, "learning_rate": 2.577639751552795e-07, "loss": 0.0005, "num_tokens": 47231640.0, "reward": 0.7666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1455 }, { "completion_length": 586.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1102.0, "completions/max_terminated_length": 1102.0, "completions/mean_length": 586.6666870117188, "completions/mean_terminated_length": 586.6666870117188, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.49389416553595655, "frac_reward_zero_std": 0.5, "grad_norm": 0.07341702282428741, "kl": 0.0, "learning_rate": 2.575914423740511e-07, "loss": -0.0005, "num_tokens": 47248190.0, "reward": 1.2333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1456 }, { "completion_length": 1043.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1892.0, "completions/max_terminated_length": 1892.0, "completions/mean_length": 1043.3333740234375, "completions/mean_terminated_length": 1043.3333740234375, "completions/min_length": 638.0, "completions/min_terminated_length": 638.0, "epoch": 0.49423337856173677, "frac_reward_zero_std": 0.5, "grad_norm": 0.08260902017354965, "kl": 0.0, "learning_rate": 2.5741890959282263e-07, "loss": -0.0001, "num_tokens": 47274684.0, "reward": 0.7333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1457 }, { "completion_length": 881.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1414.0, "completions/max_terminated_length": 1414.0, "completions/mean_length": 881.5, "completions/mean_terminated_length": 881.5, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 0.494572591587517, "frac_reward_zero_std": 1.0, "grad_norm": 8.482059854486579e-08, "kl": 0.0, "learning_rate": 2.572463768115942e-07, "loss": 0.0, "num_tokens": 47295768.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1458 }, { "completion_length": 972.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1890.0, "completions/max_terminated_length": 1890.0, "completions/mean_length": 972.0833740234375, "completions/mean_terminated_length": 972.0833740234375, "completions/min_length": 553.0, "completions/min_terminated_length": 553.0, "epoch": 0.49491180461329715, "frac_reward_zero_std": 0.5, "grad_norm": 0.25359243154525757, "kl": 0.0, "learning_rate": 2.5707384403036574e-07, "loss": -0.0024, "num_tokens": 47322427.0, "reward": 1.133333444595337, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1459 }, { "completion_length": 2337.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5896.0, "completions/max_terminated_length": 5896.0, "completions/mean_length": 2337.75, "completions/mean_terminated_length": 2337.75, "completions/min_length": 569.0, "completions/min_terminated_length": 569.0, "epoch": 0.49525101763907736, "frac_reward_zero_std": 0.5, "grad_norm": 0.11032797396183014, "kl": 0.0, "learning_rate": 2.5690131124913735e-07, "loss": -0.0003, "num_tokens": 47362720.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1460 }, { "completion_length": 1546.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3141.0, "completions/max_terminated_length": 3141.0, "completions/mean_length": 1546.75, "completions/mean_terminated_length": 1546.75, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.4955902306648575, "frac_reward_zero_std": 0.0, "grad_norm": 0.7216833829879761, "kl": 0.0, "learning_rate": 2.567287784679089e-07, "loss": -0.0147, "num_tokens": 47392603.0, "reward": 1.1166666746139526, "reward_std": 0.29902368783950806, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.3857302963733673, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1461 }, { "completion_length": 1882.75, "completions/clipped_ratio": 0.0, "completions/max_length": 6034.0, "completions/max_terminated_length": 6034.0, "completions/mean_length": 1882.75, "completions/mean_terminated_length": 1882.75, "completions/min_length": 628.0, "completions/min_terminated_length": 628.0, "epoch": 0.49592944369063774, "frac_reward_zero_std": 0.5, "grad_norm": 0.09218711405992508, "kl": 0.0, "learning_rate": 2.5655624568668045e-07, "loss": 0.0015, "num_tokens": 47427418.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1462 }, { "completion_length": 1842.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3106.0, "completions/max_terminated_length": 3106.0, "completions/mean_length": 1842.3333740234375, "completions/mean_terminated_length": 1842.3333740234375, "completions/min_length": 688.0, "completions/min_terminated_length": 688.0, "epoch": 0.4962686567164179, "frac_reward_zero_std": 0.5, "grad_norm": 0.10800613462924957, "kl": 0.0, "learning_rate": 2.56383712905452e-07, "loss": -0.002, "num_tokens": 47460344.0, "reward": 0.7333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1463 }, { "completion_length": 1950.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4632.0, "completions/max_terminated_length": 4632.0, "completions/mean_length": 1950.5, "completions/mean_terminated_length": 1950.5, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 0.4966078697421981, "frac_reward_zero_std": 1.0, "grad_norm": 3.213380068700644e-07, "kl": 0.0, "learning_rate": 2.562111801242236e-07, "loss": 0.0, "num_tokens": 47497394.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1464 }, { "completion_length": 769.0833435058594, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 1813.0, "completions/mean_length": 1318.166748046875, "completions/mean_terminated_length": 839.0, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 0.4969470827679783, "frac_reward_zero_std": 0.5, "grad_norm": 0.1686006784439087, "kl": NaN, "learning_rate": 2.5603864734299516e-07, "loss": -0.0112, "num_tokens": 47522595.0, "reward": 1.0916666984558105, "reward_std": 0.26536136865615845, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1465 }, { "completion_length": 2119.75, "completions/clipped_ratio": 0.0, "completions/max_length": 4227.0, "completions/max_terminated_length": 4227.0, "completions/mean_length": 2119.75, "completions/mean_terminated_length": 2119.75, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 0.4972862957937585, "frac_reward_zero_std": 1.0, "grad_norm": 1.5450403623162856e-07, "kl": 0.0, "learning_rate": 2.558661145617667e-07, "loss": 0.0, "num_tokens": 47560068.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1466 }, { "completion_length": 1041.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1585.0, "completions/max_terminated_length": 1585.0, "completions/mean_length": 1041.5, "completions/mean_terminated_length": 1041.5, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "epoch": 0.49762550881953865, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.5569358178053827e-07, "loss": 0.0, "num_tokens": 47583528.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1467 }, { "completion_length": 2296.8334350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 3900.0, "completions/max_terminated_length": 3900.0, "completions/mean_length": 2296.83349609375, "completions/mean_terminated_length": 2296.83349609375, "completions/min_length": 934.0, "completions/min_terminated_length": 934.0, "epoch": 0.49796472184531887, "frac_reward_zero_std": 0.5, "grad_norm": 0.7344428896903992, "kl": 0.0, "learning_rate": 2.555210489993099e-07, "loss": -0.0226, "num_tokens": 47621146.0, "reward": 1.0333333015441895, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.44585633277893066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1468 }, { "completion_length": 959.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1718.0, "completions/max_terminated_length": 1718.0, "completions/mean_length": 959.5833740234375, "completions/mean_terminated_length": 959.5833740234375, "completions/min_length": 656.0, "completions/min_terminated_length": 656.0, "epoch": 0.49830393487109903, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.553485162180814e-07, "loss": 0.0, "num_tokens": 47647919.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1469 }, { "completion_length": 1128.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2387.0, "completions/max_terminated_length": 2387.0, "completions/mean_length": 1128.916748046875, "completions/mean_terminated_length": 1128.916748046875, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.49864314789687925, "frac_reward_zero_std": 0.5, "grad_norm": 0.0917140319943428, "kl": 0.0, "learning_rate": 2.55175983436853e-07, "loss": -0.0003, "num_tokens": 47674696.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1470 }, { "completion_length": 1027.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1695.0, "completions/max_terminated_length": 1695.0, "completions/mean_length": 1027.0833740234375, "completions/mean_terminated_length": 1027.0833740234375, "completions/min_length": 771.0, "completions/min_terminated_length": 771.0, "epoch": 0.4989823609226594, "frac_reward_zero_std": 1.0, "grad_norm": 1.875989426025626e-07, "kl": 0.0, "learning_rate": 2.550034506556246e-07, "loss": 0.0, "num_tokens": 47699537.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1471 }, { "completion_length": 1540.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4005.0, "completions/max_terminated_length": 4005.0, "completions/mean_length": 1540.666748046875, "completions/mean_terminated_length": 1540.666748046875, "completions/min_length": 652.0, "completions/min_terminated_length": 652.0, "epoch": 0.4993215739484396, "frac_reward_zero_std": 0.5, "grad_norm": 0.058931849896907806, "kl": 0.0, "learning_rate": 2.5483091787439614e-07, "loss": 0.0, "num_tokens": 47729119.0, "reward": 0.7666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1472 }, { "completion_length": 898.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1102.0, "completions/max_terminated_length": 1102.0, "completions/mean_length": 898.0833740234375, "completions/mean_terminated_length": 898.0833740234375, "completions/min_length": 547.0, "completions/min_terminated_length": 547.0, "epoch": 0.4996607869742198, "frac_reward_zero_std": 1.0, "grad_norm": 1.5320672730467777e-07, "kl": 0.0, "learning_rate": 2.546583850931677e-07, "loss": 0.0, "num_tokens": 47751914.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1473 }, { "completion_length": 2586.0001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5234.0, "completions/mean_length": 3135.08349609375, "completions/mean_terminated_length": 2821.091064453125, "completions/min_length": 1674.0, "completions/min_terminated_length": 1674.0, "epoch": 0.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.6311071515083313, "kl": NaN, "learning_rate": 2.5448585231193924e-07, "loss": -0.0177, "num_tokens": 47792816.0, "reward": 0.4750000238418579, "reward_std": 0.2524876296520233, "rewards/correctness_reward_func/mean": 0.20000000298023224, "rewards/correctness_reward_func/std": 0.36181363463401794, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1474 }, { "completion_length": 2092.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4293.0, "completions/max_terminated_length": 4293.0, "completions/mean_length": 2092.0, "completions/mean_terminated_length": 2092.0, "completions/min_length": 802.0, "completions/min_terminated_length": 802.0, "epoch": 0.5003392130257802, "frac_reward_zero_std": 0.5, "grad_norm": 0.10809829086065292, "kl": 0.0, "learning_rate": 2.5431331953071085e-07, "loss": -0.005, "num_tokens": 47826866.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1475 }, { "completion_length": 1670.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2998.0, "completions/max_terminated_length": 2998.0, "completions/mean_length": 1670.5833740234375, "completions/mean_terminated_length": 1670.5833740234375, "completions/min_length": 1068.0, "completions/min_terminated_length": 1068.0, "epoch": 0.5006784260515604, "frac_reward_zero_std": 0.5, "grad_norm": 0.11652311682701111, "kl": 0.0, "learning_rate": 2.541407867494824e-07, "loss": -0.0018, "num_tokens": 47858949.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1476 }, { "completion_length": 1235.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2558.0, "completions/max_terminated_length": 2558.0, "completions/mean_length": 1235.75, "completions/mean_terminated_length": 1235.75, "completions/min_length": 601.0, "completions/min_terminated_length": 601.0, "epoch": 0.5010176390773405, "frac_reward_zero_std": 1.0, "grad_norm": 1.9835449904803681e-07, "kl": 0.0, "learning_rate": 2.5396825396825396e-07, "loss": 0.0, "num_tokens": 47887158.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1477 }, { "completion_length": 1437.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2914.0, "completions/max_terminated_length": 2914.0, "completions/mean_length": 1437.3333740234375, "completions/mean_terminated_length": 1437.3333740234375, "completions/min_length": 706.0, "completions/min_terminated_length": 706.0, "epoch": 0.5013568521031208, "frac_reward_zero_std": 0.5, "grad_norm": 0.34220340847969055, "kl": 0.0, "learning_rate": 2.537957211870255e-07, "loss": -0.0006, "num_tokens": 47913442.0, "reward": 1.1833332777023315, "reward_std": 0.19407901167869568, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1478 }, { "completion_length": 1364.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2213.0, "completions/max_terminated_length": 2213.0, "completions/mean_length": 1364.5, "completions/mean_terminated_length": 1364.5, "completions/min_length": 565.0, "completions/min_terminated_length": 565.0, "epoch": 0.501696065128901, "frac_reward_zero_std": 1.0, "grad_norm": 1.209706965710211e-07, "kl": 0.0, "learning_rate": 2.536231884057971e-07, "loss": 0.0, "num_tokens": 47938462.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1479 }, { "completion_length": 2174.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4637.0, "completions/max_terminated_length": 4637.0, "completions/mean_length": 2174.416748046875, "completions/mean_terminated_length": 2174.416748046875, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 0.5020352781546812, "frac_reward_zero_std": 1.0, "grad_norm": 1.708511376818933e-07, "kl": 0.0, "learning_rate": 2.534506556245686e-07, "loss": 0.0, "num_tokens": 47980707.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1480 }, { "completion_length": 1734.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4259.0, "completions/max_terminated_length": 4259.0, "completions/mean_length": 1734.5, "completions/mean_terminated_length": 1734.5, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 0.5023744911804613, "frac_reward_zero_std": 0.5, "grad_norm": 0.03856414929032326, "kl": 0.0, "learning_rate": 2.532781228433402e-07, "loss": -0.0, "num_tokens": 48012927.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1481 }, { "completion_length": 2017.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3410.0, "completions/max_terminated_length": 3410.0, "completions/mean_length": 2017.5, "completions/mean_terminated_length": 2017.5, "completions/min_length": 973.0, "completions/min_terminated_length": 973.0, "epoch": 0.5027137042062415, "frac_reward_zero_std": 0.5, "grad_norm": 0.09233441948890686, "kl": 0.0, "learning_rate": 2.5310559006211177e-07, "loss": -0.001, "num_tokens": 48050277.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1482 }, { "completion_length": 2162.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5024.0, "completions/max_terminated_length": 5024.0, "completions/mean_length": 2162.25, "completions/mean_terminated_length": 2162.25, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "epoch": 0.5030529172320217, "frac_reward_zero_std": 0.5, "grad_norm": 0.0756925418972969, "kl": 0.0, "learning_rate": 2.529330572808834e-07, "loss": -0.0001, "num_tokens": 48088770.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1483 }, { "completion_length": 739.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1158.0, "completions/max_terminated_length": 1158.0, "completions/mean_length": 739.5833740234375, "completions/mean_terminated_length": 739.5833740234375, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.5033921302578019, "frac_reward_zero_std": 0.5, "grad_norm": 0.05837513878941536, "kl": 0.0, "learning_rate": 2.527605244996549e-07, "loss": -0.0009, "num_tokens": 48110065.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1484 }, { "completion_length": 1982.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2790.0, "completions/max_terminated_length": 2790.0, "completions/mean_length": 1982.5833740234375, "completions/mean_terminated_length": 1982.5833740234375, "completions/min_length": 1539.0, "completions/min_terminated_length": 1539.0, "epoch": 0.503731343283582, "frac_reward_zero_std": 0.0, "grad_norm": 0.12157196551561356, "kl": 0.0, "learning_rate": 2.525879917184265e-07, "loss": -0.0024, "num_tokens": 48146594.0, "reward": 1.2166666984558105, "reward_std": 0.09246456623077393, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1485 }, { "completion_length": 700.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1093.0, "completions/max_terminated_length": 1093.0, "completions/mean_length": 700.4166870117188, "completions/mean_terminated_length": 700.4166870117188, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.5040705563093623, "frac_reward_zero_std": 1.0, "grad_norm": 1.1596237925459718e-07, "kl": 0.0, "learning_rate": 2.524154589371981e-07, "loss": 0.0, "num_tokens": 48167503.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1486 }, { "completion_length": 1190.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1643.0, "completions/max_terminated_length": 1643.0, "completions/mean_length": 1190.8333740234375, "completions/mean_terminated_length": 1190.8333740234375, "completions/min_length": 710.0, "completions/min_terminated_length": 710.0, "epoch": 0.5044097693351425, "frac_reward_zero_std": 0.5, "grad_norm": 0.07946118712425232, "kl": 0.0, "learning_rate": 2.5224292615596964e-07, "loss": 0.0008, "num_tokens": 48194897.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1487 }, { "completion_length": 1068.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2684.0, "completions/max_terminated_length": 2684.0, "completions/mean_length": 1068.0833740234375, "completions/mean_terminated_length": 1068.0833740234375, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.5047489823609227, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.520703933747412e-07, "loss": 0.0, "num_tokens": 48219156.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1488 }, { "completion_length": 696.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1037.0, "completions/max_terminated_length": 1037.0, "completions/mean_length": 696.3333740234375, "completions/mean_terminated_length": 696.3333740234375, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 0.5050881953867028, "frac_reward_zero_std": 0.5, "grad_norm": 0.08128704875707626, "kl": 0.0, "learning_rate": 2.5189786059351275e-07, "loss": -0.0013, "num_tokens": 48236008.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1489 }, { "completion_length": 3478.166748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6487.0, "completions/mean_length": 4027.25, "completions/mean_terminated_length": 3794.36376953125, "completions/min_length": 1679.0, "completions/min_terminated_length": 1679.0, "epoch": 0.505427408412483, "frac_reward_zero_std": 0.5, "grad_norm": 0.09632746875286102, "kl": NaN, "learning_rate": 2.5172532781228435e-07, "loss": -0.0154, "num_tokens": 48288048.0, "reward": 0.27500003576278687, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 1490 }, { "completion_length": 828.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1293.0, "completions/max_terminated_length": 1293.0, "completions/mean_length": 828.1666870117188, "completions/mean_terminated_length": 828.1666870117188, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 0.5057666214382632, "frac_reward_zero_std": 0.5, "grad_norm": 0.05527422949671745, "kl": 0.0, "learning_rate": 2.5155279503105585e-07, "loss": -0.0009, "num_tokens": 48310556.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1491 }, { "completion_length": 4042.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6168.0, "completions/max_terminated_length": 6168.0, "completions/mean_length": 4042.5, "completions/mean_terminated_length": 4042.5, "completions/min_length": 1814.0, "completions/min_terminated_length": 1814.0, "epoch": 0.5061058344640434, "frac_reward_zero_std": 1.0, "grad_norm": 3.171755906805629e-07, "kl": 0.0, "learning_rate": 2.5138026224982746e-07, "loss": 0.0, "num_tokens": 48371186.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1492 }, { "completion_length": 873.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1080.0, "completions/max_terminated_length": 1080.0, "completions/mean_length": 873.8333740234375, "completions/mean_terminated_length": 873.8333740234375, "completions/min_length": 654.0, "completions/min_terminated_length": 654.0, "epoch": 0.5064450474898237, "frac_reward_zero_std": 0.0, "grad_norm": 0.3072753846645355, "kl": 0.0, "learning_rate": 2.51207729468599e-07, "loss": -0.0003, "num_tokens": 48396882.0, "reward": 1.2000000476837158, "reward_std": 0.24494895339012146, "rewards/correctness_reward_func/mean": 0.9000000357627869, "rewards/correctness_reward_func/std": 0.28919950127601624, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1493 }, { "completion_length": 480.83335876464844, "completions/clipped_ratio": 0.0, "completions/max_length": 805.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 480.8333435058594, "completions/mean_terminated_length": 480.8333435058594, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.5067842605156038, "frac_reward_zero_std": 0.5, "grad_norm": 0.3365095257759094, "kl": 0.0, "learning_rate": 2.510351966873706e-07, "loss": -0.0016, "num_tokens": 48414688.0, "reward": 1.2041666507720947, "reward_std": 0.23474276065826416, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1494 }, { "completion_length": 628.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1454.0, "completions/max_terminated_length": 1454.0, "completions/mean_length": 628.75, "completions/mean_terminated_length": 628.75, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.507123473541384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.508626639061421e-07, "loss": 0.0, "num_tokens": 48432421.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1495 }, { "completion_length": 875.2500305175781, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 2395.0, "completions/mean_length": 1424.3333740234375, "completions/mean_terminated_length": 954.8182373046875, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 0.5074626865671642, "frac_reward_zero_std": 0.5, "grad_norm": 0.19508016109466553, "kl": NaN, "learning_rate": 2.506901311249137e-07, "loss": -0.0191, "num_tokens": 48459280.0, "reward": 1.125, "reward_std": 0.23611438274383545, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444522619247437, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 1496 }, { "completion_length": 1999.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3607.0, "completions/max_terminated_length": 3607.0, "completions/mean_length": 1999.0833740234375, "completions/mean_terminated_length": 1999.0833740234375, "completions/min_length": 640.0, "completions/min_terminated_length": 640.0, "epoch": 0.5078018995929444, "frac_reward_zero_std": 1.0, "grad_norm": 2.0572075243308063e-07, "kl": 0.0, "learning_rate": 2.505175983436853e-07, "loss": 0.0, "num_tokens": 48494189.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1497 }, { "completion_length": 1550.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2902.0, "completions/max_terminated_length": 2902.0, "completions/mean_length": 1550.166748046875, "completions/mean_terminated_length": 1550.166748046875, "completions/min_length": 543.0, "completions/min_terminated_length": 543.0, "epoch": 0.5081411126187245, "frac_reward_zero_std": 0.0, "grad_norm": 0.08826068788766861, "kl": 0.0, "learning_rate": 2.503450655624569e-07, "loss": 0.0004, "num_tokens": 48523879.0, "reward": 0.7041667699813843, "reward_std": 0.07144345343112946, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1498 }, { "completion_length": 1549.416748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3293.0, "completions/mean_length": 2098.5, "completions/mean_terminated_length": 1690.2728271484375, "completions/min_length": 578.0, "completions/min_terminated_length": 578.0, "epoch": 0.5084803256445047, "frac_reward_zero_std": 0.5, "grad_norm": 0.2377554327249527, "kl": NaN, "learning_rate": 2.501725327812284e-07, "loss": -0.0323, "num_tokens": 48549882.0, "reward": 0.6916667819023132, "reward_std": 0.26536139845848083, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1499 }, { "completion_length": 1704.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3541.0, "completions/max_terminated_length": 3541.0, "completions/mean_length": 1704.5833740234375, "completions/mean_terminated_length": 1704.5833740234375, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.508819538670285, "frac_reward_zero_std": 1.0, "grad_norm": 6.911277949939176e-08, "kl": 0.0, "learning_rate": 2.5e-07, "loss": 0.0, "num_tokens": 48580555.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1500 }, { "completion_length": 2207.2501220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5553.0, "completions/mean_length": 2756.33349609375, "completions/mean_terminated_length": 2407.9091796875, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 0.5091587516960652, "frac_reward_zero_std": 0.0, "grad_norm": 0.35127484798431396, "kl": NaN, "learning_rate": 2.4982746721877154e-07, "loss": -0.0401, "num_tokens": 48620044.0, "reward": 0.6750000715255737, "reward_std": 0.38783591985702515, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1501 }, { "completion_length": 738.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1182.0, "completions/max_terminated_length": 1182.0, "completions/mean_length": 738.5, "completions/mean_terminated_length": 738.5, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 0.5094979647218453, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.4965493443754315e-07, "loss": 0.0, "num_tokens": 48642352.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1502 }, { "completion_length": 727.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1062.0, "completions/max_terminated_length": 1062.0, "completions/mean_length": 727.5833740234375, "completions/mean_terminated_length": 727.5833740234375, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 0.5098371777476255, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.494824016563147e-07, "loss": 0.0, "num_tokens": 48659741.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1503 }, { "completion_length": 2378.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 5596.0, "completions/max_terminated_length": 5596.0, "completions/mean_length": 2378.666748046875, "completions/mean_terminated_length": 2378.666748046875, "completions/min_length": 675.0, "completions/min_terminated_length": 675.0, "epoch": 0.5101763907734057, "frac_reward_zero_std": 0.5, "grad_norm": 0.4364568293094635, "kl": 0.0, "learning_rate": 2.4930986887508625e-07, "loss": -0.0052, "num_tokens": 48700291.0, "reward": 0.6333333253860474, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1504 }, { "completion_length": 874.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1667.0, "completions/max_terminated_length": 1667.0, "completions/mean_length": 874.4166870117188, "completions/mean_terminated_length": 874.4166870117188, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 0.5105156037991859, "frac_reward_zero_std": 0.0, "grad_norm": 0.11074087023735046, "kl": 0.0, "learning_rate": 2.491373360938578e-07, "loss": 0.0004, "num_tokens": 48721386.0, "reward": 1.1666667461395264, "reward_std": 0.09559705853462219, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1505 }, { "completion_length": 1689.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3274.0, "completions/max_terminated_length": 3274.0, "completions/mean_length": 1689.25, "completions/mean_terminated_length": 1689.25, "completions/min_length": 631.0, "completions/min_terminated_length": 631.0, "epoch": 0.510854816824966, "frac_reward_zero_std": 0.5, "grad_norm": 0.48789387941360474, "kl": 0.0, "learning_rate": 2.489648033126294e-07, "loss": 0.0089, "num_tokens": 48755481.0, "reward": 0.9833334684371948, "reward_std": 0.2228602170944214, "rewards/correctness_reward_func/mean": 0.6833333373069763, "rewards/correctness_reward_func/std": 0.32427072525024414, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1506 }, { "completion_length": 1750.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3116.0, "completions/max_terminated_length": 3116.0, "completions/mean_length": 1750.416748046875, "completions/mean_terminated_length": 1750.416748046875, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 0.5111940298507462, "frac_reward_zero_std": 0.5, "grad_norm": 0.13684289157390594, "kl": 0.0, "learning_rate": 2.4879227053140096e-07, "loss": -0.0007, "num_tokens": 48789542.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1507 }, { "completion_length": 1545.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3369.0, "completions/max_terminated_length": 3369.0, "completions/mean_length": 1545.0, "completions/mean_terminated_length": 1545.0, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 0.5115332428765265, "frac_reward_zero_std": 0.5, "grad_norm": 0.09492339193820953, "kl": 0.0, "learning_rate": 2.486197377501725e-07, "loss": -0.0017, "num_tokens": 48822164.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1508 }, { "completion_length": 1833.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 3719.0, "completions/max_terminated_length": 3719.0, "completions/mean_length": 1833.0, "completions/mean_terminated_length": 1833.0, "completions/min_length": 567.0, "completions/min_terminated_length": 567.0, "epoch": 0.5118724559023067, "frac_reward_zero_std": 0.5, "grad_norm": 0.6496280431747437, "kl": 0.0, "learning_rate": 2.4844720496894407e-07, "loss": 0.0239, "num_tokens": 48855794.0, "reward": 1.0166666507720947, "reward_std": 0.24013885855674744, "rewards/correctness_reward_func/mean": 0.7166666984558105, "rewards/correctness_reward_func/std": 0.4386618733406067, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1509 }, { "completion_length": 1598.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3362.0, "completions/max_terminated_length": 3362.0, "completions/mean_length": 1598.916748046875, "completions/mean_terminated_length": 1598.916748046875, "completions/min_length": 694.0, "completions/min_terminated_length": 694.0, "epoch": 0.5122116689280869, "frac_reward_zero_std": 0.0, "grad_norm": 0.7508085370063782, "kl": 0.0, "learning_rate": 2.482746721877157e-07, "loss": 0.0096, "num_tokens": 48887977.0, "reward": 0.8666666746139526, "reward_std": 0.5221890211105347, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1510 }, { "completion_length": 2299.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4102.0, "completions/max_terminated_length": 4102.0, "completions/mean_length": 2299.25, "completions/mean_terminated_length": 2299.25, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "epoch": 0.512550881953867, "frac_reward_zero_std": 0.5, "grad_norm": 0.0988108366727829, "kl": 0.0, "learning_rate": 2.4810213940648723e-07, "loss": -0.0001, "num_tokens": 48927370.0, "reward": 1.2333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1511 }, { "completion_length": 1637.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4044.0, "completions/max_terminated_length": 4044.0, "completions/mean_length": 1637.416748046875, "completions/mean_terminated_length": 1637.416748046875, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 0.5128900949796472, "frac_reward_zero_std": 0.0, "grad_norm": 0.12136387825012207, "kl": 0.0, "learning_rate": 2.479296066252588e-07, "loss": 0.0007, "num_tokens": 48957027.0, "reward": 0.6666667461395264, "reward_std": 0.07955466210842133, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.07385490089654922, "step": 1512 }, { "completion_length": 1638.7500610351562, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4799.0, "completions/mean_length": 2187.83349609375, "completions/mean_terminated_length": 1787.727294921875, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 0.5132293080054274, "frac_reward_zero_std": 0.5, "grad_norm": 0.7237550020217896, "kl": NaN, "learning_rate": 2.4775707384403033e-07, "loss": -0.0314, "num_tokens": 48989424.0, "reward": 1.024999976158142, "reward_std": 0.3061861991882324, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1513 }, { "completion_length": 1524.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 5482.0, "completions/max_terminated_length": 5482.0, "completions/mean_length": 1524.666748046875, "completions/mean_terminated_length": 1524.666748046875, "completions/min_length": 722.0, "completions/min_terminated_length": 722.0, "epoch": 0.5135685210312076, "frac_reward_zero_std": 0.0, "grad_norm": 0.14087025821208954, "kl": 0.0, "learning_rate": 2.475845410628019e-07, "loss": 0.0057, "num_tokens": 49019564.0, "reward": 1.2166666984558105, "reward_std": 0.10641198605298996, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1514 }, { "completion_length": 708.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1138.0, "completions/max_terminated_length": 1138.0, "completions/mean_length": 708.1666870117188, "completions/mean_terminated_length": 708.1666870117188, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 0.5139077340569878, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.474120082815735e-07, "loss": 0.0, "num_tokens": 49035598.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1515 }, { "completion_length": 1700.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3581.0, "completions/max_terminated_length": 3581.0, "completions/mean_length": 1700.5, "completions/mean_terminated_length": 1700.5, "completions/min_length": 723.0, "completions/min_terminated_length": 723.0, "epoch": 0.514246947082768, "frac_reward_zero_std": 1.0, "grad_norm": 1.0136165684571097e-07, "kl": 0.0, "learning_rate": 2.4723947550034505e-07, "loss": 0.0, "num_tokens": 49065706.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1516 }, { "completion_length": 1448.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2963.0, "completions/max_terminated_length": 2963.0, "completions/mean_length": 1448.75, "completions/mean_terminated_length": 1448.75, "completions/min_length": 759.0, "completions/min_terminated_length": 759.0, "epoch": 0.5145861601085482, "frac_reward_zero_std": 0.5, "grad_norm": 0.11275298148393631, "kl": 0.0, "learning_rate": 2.4706694271911665e-07, "loss": -0.0009, "num_tokens": 49097821.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1517 }, { "completion_length": 744.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1298.0, "completions/max_terminated_length": 1298.0, "completions/mean_length": 744.5, "completions/mean_terminated_length": 744.5, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.5149253731343284, "frac_reward_zero_std": 1.0, "grad_norm": 8.817777796821247e-08, "kl": 0.0, "learning_rate": 2.468944099378882e-07, "loss": 0.0, "num_tokens": 49118197.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1518 }, { "completion_length": 2520.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5057.0, "completions/max_terminated_length": 5057.0, "completions/mean_length": 2520.916748046875, "completions/mean_terminated_length": 2520.916748046875, "completions/min_length": 1217.0, "completions/min_terminated_length": 1217.0, "epoch": 0.5152645861601085, "frac_reward_zero_std": 0.5, "grad_norm": 0.7658690214157104, "kl": 0.0, "learning_rate": 2.4672187715665976e-07, "loss": 0.0126, "num_tokens": 49158396.0, "reward": 1.0499999523162842, "reward_std": 0.27386125922203064, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1519 }, { "completion_length": 1721.3333740234375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5437.0, "completions/mean_length": 2819.5, "completions/mean_terminated_length": 2065.60009765625, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.5156037991858887, "frac_reward_zero_std": 0.5, "grad_norm": 0.2232201099395752, "kl": NaN, "learning_rate": 2.465493443754313e-07, "loss": -0.021, "num_tokens": 49190110.0, "reward": 0.6500000953674316, "reward_std": 0.0774596780538559, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 1520 }, { "completion_length": 1508.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2760.0, "completions/max_terminated_length": 2760.0, "completions/mean_length": 1508.5, "completions/mean_terminated_length": 1508.5, "completions/min_length": 580.0, "completions/min_terminated_length": 580.0, "epoch": 0.5159430122116689, "frac_reward_zero_std": 0.5, "grad_norm": 0.45441731810569763, "kl": 0.0, "learning_rate": 2.463768115942029e-07, "loss": -0.0076, "num_tokens": 49218514.0, "reward": 0.38333332538604736, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.0833333358168602, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1521 }, { "completion_length": 992.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1885.0, "completions/max_terminated_length": 1885.0, "completions/mean_length": 992.0, "completions/mean_terminated_length": 992.0, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "epoch": 0.5162822252374492, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.4620427881297447e-07, "loss": 0.0, "num_tokens": 49241296.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1522 }, { "completion_length": 845.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1464.0, "completions/max_terminated_length": 1464.0, "completions/mean_length": 845.0, "completions/mean_terminated_length": 845.0, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.5166214382632293, "frac_reward_zero_std": 0.0, "grad_norm": 0.09237980097532272, "kl": 0.0, "learning_rate": 2.46031746031746e-07, "loss": 0.0008, "num_tokens": 49262362.0, "reward": 1.2000000476837158, "reward_std": 0.08164963126182556, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1523 }, { "completion_length": 1423.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3395.0, "completions/max_terminated_length": 3395.0, "completions/mean_length": 1423.25, "completions/mean_terminated_length": 1423.25, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.5169606512890095, "frac_reward_zero_std": 0.5, "grad_norm": 0.11808612942695618, "kl": 0.0, "learning_rate": 2.458592132505176e-07, "loss": 0.0, "num_tokens": 49292215.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1524 }, { "completion_length": 2337.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5480.0, "completions/max_terminated_length": 5480.0, "completions/mean_length": 2337.08349609375, "completions/mean_terminated_length": 2337.08349609375, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "epoch": 0.5172998643147897, "frac_reward_zero_std": 1.0, "grad_norm": 1.045852116021706e-07, "kl": 0.0, "learning_rate": 2.4568668046928913e-07, "loss": 0.0, "num_tokens": 49330988.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1525 }, { "completion_length": 906.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1822.0, "completions/max_terminated_length": 1822.0, "completions/mean_length": 906.4166870117188, "completions/mean_terminated_length": 906.4166870117188, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.5176390773405699, "frac_reward_zero_std": 0.5, "grad_norm": 0.07079682499170303, "kl": 0.0, "learning_rate": 2.4551414768806073e-07, "loss": -0.0005, "num_tokens": 49353919.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1526 }, { "completion_length": 1366.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2183.0, "completions/max_terminated_length": 2183.0, "completions/mean_length": 1366.8333740234375, "completions/mean_terminated_length": 1366.8333740234375, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 0.51797829036635, "frac_reward_zero_std": 0.5, "grad_norm": 0.5180139541625977, "kl": 0.0, "learning_rate": 2.453416149068323e-07, "loss": 0.0024, "num_tokens": 49383263.0, "reward": 0.833333432674408, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1527 }, { "completion_length": 1335.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2015.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1335.416748046875, "completions/mean_terminated_length": 1335.416748046875, "completions/min_length": 606.0, "completions/min_terminated_length": 606.0, "epoch": 0.5183175033921302, "frac_reward_zero_std": 0.0, "grad_norm": 0.3672662675380707, "kl": 0.0, "learning_rate": 2.4516908212560384e-07, "loss": 0.0052, "num_tokens": 49415746.0, "reward": 1.1041667461395264, "reward_std": 0.20917050540447235, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1528 }, { "completion_length": 1882.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4725.0, "completions/max_terminated_length": 4725.0, "completions/mean_length": 1882.25, "completions/mean_terminated_length": 1882.25, "completions/min_length": 728.0, "completions/min_terminated_length": 728.0, "epoch": 0.5186567164179104, "frac_reward_zero_std": 0.0, "grad_norm": 0.6903595328330994, "kl": 0.0, "learning_rate": 2.449965493443754e-07, "loss": 0.0405, "num_tokens": 49451323.0, "reward": 1.0499999523162842, "reward_std": 0.36742347478866577, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.36306774616241455, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1529 }, { "completion_length": 1115.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2328.0, "completions/max_terminated_length": 2328.0, "completions/mean_length": 1115.666748046875, "completions/mean_terminated_length": 1115.666748046875, "completions/min_length": 669.0, "completions/min_terminated_length": 669.0, "epoch": 0.5189959294436907, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.44824016563147e-07, "loss": 0.0, "num_tokens": 49476543.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1530 }, { "completion_length": 1913.75, "completions/clipped_ratio": 0.0, "completions/max_length": 4482.0, "completions/max_terminated_length": 4482.0, "completions/mean_length": 1913.75, "completions/mean_terminated_length": 1913.75, "completions/min_length": 620.0, "completions/min_terminated_length": 620.0, "epoch": 0.5193351424694709, "frac_reward_zero_std": 0.0, "grad_norm": 0.6608690619468689, "kl": 0.0, "learning_rate": 2.4465148378191855e-07, "loss": -0.0271, "num_tokens": 49508046.0, "reward": 0.9500000476837158, "reward_std": 0.27386125922203064, "rewards/correctness_reward_func/mean": 0.6499999761581421, "rewards/correctness_reward_func/std": 0.40113475918769836, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1531 }, { "completion_length": 1341.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2495.0, "completions/max_terminated_length": 2495.0, "completions/mean_length": 1341.25, "completions/mean_terminated_length": 1341.25, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 0.519674355495251, "frac_reward_zero_std": 0.0, "grad_norm": 0.1088586375117302, "kl": 0.0, "learning_rate": 2.4447895100069016e-07, "loss": -0.0016, "num_tokens": 49537125.0, "reward": 0.7083333730697632, "reward_std": 0.09036960452795029, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 1532 }, { "completion_length": 1030.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1764.0, "completions/max_terminated_length": 1764.0, "completions/mean_length": 1030.166748046875, "completions/mean_terminated_length": 1030.166748046875, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 0.5200135685210312, "frac_reward_zero_std": 1.0, "grad_norm": 9.859262917188971e-08, "kl": 0.0, "learning_rate": 2.443064182194617e-07, "loss": 0.0, "num_tokens": 49561361.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1533 }, { "completion_length": 2391.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3764.0, "completions/max_terminated_length": 3764.0, "completions/mean_length": 2391.666748046875, "completions/mean_terminated_length": 2391.666748046875, "completions/min_length": 1171.0, "completions/min_terminated_length": 1171.0, "epoch": 0.5203527815468114, "frac_reward_zero_std": 0.5, "grad_norm": 0.3696172833442688, "kl": 0.0, "learning_rate": 2.4413388543823326e-07, "loss": -0.0039, "num_tokens": 49603753.0, "reward": 0.6333333253860474, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1534 }, { "completion_length": 1300.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2397.0, "completions/max_terminated_length": 2397.0, "completions/mean_length": 1300.166748046875, "completions/mean_terminated_length": 1300.166748046875, "completions/min_length": 602.0, "completions/min_terminated_length": 602.0, "epoch": 0.5206919945725916, "frac_reward_zero_std": 1.0, "grad_norm": 1.0094379376823781e-07, "kl": 0.0, "learning_rate": 2.439613526570048e-07, "loss": 0.0, "num_tokens": 49632591.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1535 }, { "completion_length": 1826.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4222.0, "completions/max_terminated_length": 4222.0, "completions/mean_length": 1826.3333740234375, "completions/mean_terminated_length": 1826.3333740234375, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.5210312075983717, "frac_reward_zero_std": 0.0, "grad_norm": 0.14351871609687805, "kl": 0.0, "learning_rate": 2.4378881987577637e-07, "loss": -0.0009, "num_tokens": 49660957.0, "reward": 1.2333333492279053, "reward_std": 0.10327951610088348, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1536 }, { "completion_length": 2376.166748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6328.0, "completions/mean_length": 2925.25, "completions/mean_terminated_length": 2592.181884765625, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 0.521370420624152, "frac_reward_zero_std": 0.5, "grad_norm": 0.1891297549009323, "kl": NaN, "learning_rate": 2.4361628709454797e-07, "loss": -0.0156, "num_tokens": 49705035.0, "reward": 0.27500003576278687, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 1537 }, { "completion_length": 1144.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3013.0, "completions/max_terminated_length": 3013.0, "completions/mean_length": 1144.75, "completions/mean_terminated_length": 1144.75, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 0.5217096336499322, "frac_reward_zero_std": 0.5, "grad_norm": 0.09957168996334076, "kl": 0.0, "learning_rate": 2.434437543133195e-07, "loss": 0.0005, "num_tokens": 49729620.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1538 }, { "completion_length": 1906.416748046875, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6426.0, "completions/mean_length": 3553.666748046875, "completions/mean_terminated_length": 2541.888916015625, "completions/min_length": 806.0, "completions/min_terminated_length": 806.0, "epoch": 0.5220488466757124, "frac_reward_zero_std": 0.5, "grad_norm": 0.8302173614501953, "kl": NaN, "learning_rate": 2.432712215320911e-07, "loss": -0.0683, "num_tokens": 49763123.0, "reward": 0.6916667819023132, "reward_std": 0.21311190724372864, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2250000238418579, "rewards/format_reward_func/std": 0.13568010926246643, "step": 1539 }, { "completion_length": 783.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1439.0, "completions/max_terminated_length": 1439.0, "completions/mean_length": 783.9166870117188, "completions/mean_terminated_length": 783.9166870117188, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 0.5223880597014925, "frac_reward_zero_std": 0.5, "grad_norm": 0.44903701543807983, "kl": 0.0, "learning_rate": 2.4309868875086263e-07, "loss": 0.0056, "num_tokens": 49786318.0, "reward": 1.0499999523162842, "reward_std": 0.27386125922203064, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1540 }, { "completion_length": 2043.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2650.0, "completions/max_terminated_length": 2650.0, "completions/mean_length": 2043.5833740234375, "completions/mean_terminated_length": 2043.5833740234375, "completions/min_length": 1495.0, "completions/min_terminated_length": 1495.0, "epoch": 0.5227272727272727, "frac_reward_zero_std": 0.5, "grad_norm": 0.11377552896738052, "kl": 0.0, "learning_rate": 2.4292615596963424e-07, "loss": -0.002, "num_tokens": 49822931.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1541 }, { "completion_length": 1487.5833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3299.0, "completions/mean_length": 2036.666748046875, "completions/mean_terminated_length": 1622.8182373046875, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.5230664857530529, "frac_reward_zero_std": 0.5, "grad_norm": 0.5692168474197388, "kl": NaN, "learning_rate": 2.427536231884058e-07, "loss": -0.0177, "num_tokens": 49854534.0, "reward": 0.9250000715255737, "reward_std": 0.2602883279323578, "rewards/correctness_reward_func/mean": 0.6500000357627869, "rewards/correctness_reward_func/std": 0.4833594560623169, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1542 }, { "completion_length": 892.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1467.0, "completions/max_terminated_length": 1467.0, "completions/mean_length": 892.9166870117188, "completions/mean_terminated_length": 892.9166870117188, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 0.5234056987788331, "frac_reward_zero_std": 1.0, "grad_norm": 1.1185508697053592e-07, "kl": 0.0, "learning_rate": 2.4258109040717734e-07, "loss": 0.0, "num_tokens": 49874297.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1543 }, { "completion_length": 1050.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2042.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1050.416748046875, "completions/mean_terminated_length": 1050.416748046875, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 0.5237449118046132, "frac_reward_zero_std": 0.5, "grad_norm": 0.06605172157287598, "kl": 0.0, "learning_rate": 2.424085576259489e-07, "loss": -0.0015, "num_tokens": 49899244.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1544 }, { "completion_length": 2098.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4855.0, "completions/max_terminated_length": 4855.0, "completions/mean_length": 2098.0, "completions/mean_terminated_length": 2098.0, "completions/min_length": 651.0, "completions/min_terminated_length": 651.0, "epoch": 0.5240841248303935, "frac_reward_zero_std": 0.5, "grad_norm": 0.09212353825569153, "kl": 0.0, "learning_rate": 2.422360248447205e-07, "loss": -0.005, "num_tokens": 49935922.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1545 }, { "completion_length": 831.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2174.0, "completions/max_terminated_length": 2174.0, "completions/mean_length": 831.25, "completions/mean_terminated_length": 831.25, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.5244233378561737, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.4206349206349205e-07, "loss": 0.0, "num_tokens": 49959811.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1546 }, { "completion_length": 1225.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2272.0, "completions/max_terminated_length": 2272.0, "completions/mean_length": 1225.8333740234375, "completions/mean_terminated_length": 1225.8333740234375, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 0.5247625508819539, "frac_reward_zero_std": 0.5, "grad_norm": 0.05238298699259758, "kl": 0.0, "learning_rate": 2.418909592822636e-07, "loss": -0.0022, "num_tokens": 49987583.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1547 }, { "completion_length": 771.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1534.0, "completions/max_terminated_length": 1534.0, "completions/mean_length": 771.6666870117188, "completions/mean_terminated_length": 771.6666870117188, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.5251017639077341, "frac_reward_zero_std": 0.5, "grad_norm": 0.05934571102261543, "kl": 0.0, "learning_rate": 2.417184265010352e-07, "loss": 0.0, "num_tokens": 50011399.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1548 }, { "completion_length": 3110.75, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5878.0, "completions/mean_length": 3659.83349609375, "completions/mean_terminated_length": 3393.545654296875, "completions/min_length": 1928.0, "completions/min_terminated_length": 1928.0, "epoch": 0.5254409769335142, "frac_reward_zero_std": 0.0, "grad_norm": 0.14363695681095123, "kl": NaN, "learning_rate": 2.4154589371980677e-07, "loss": -0.0153, "num_tokens": 50057884.0, "reward": 0.7083333730697632, "reward_std": 0.1128770112991333, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1549 }, { "completion_length": 1343.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2948.0, "completions/max_terminated_length": 2948.0, "completions/mean_length": 1343.8333740234375, "completions/mean_terminated_length": 1343.8333740234375, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.5257801899592944, "frac_reward_zero_std": 0.5, "grad_norm": 0.05717303976416588, "kl": 0.0, "learning_rate": 2.413733609385783e-07, "loss": 0.0009, "num_tokens": 50084330.0, "reward": 1.2333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1550 }, { "completion_length": 2794.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5176.0, "completions/max_terminated_length": 5176.0, "completions/mean_length": 2794.08349609375, "completions/mean_terminated_length": 2794.08349609375, "completions/min_length": 1656.0, "completions/min_terminated_length": 1656.0, "epoch": 0.5261194029850746, "frac_reward_zero_std": 0.5, "grad_norm": 0.6242930293083191, "kl": 0.0, "learning_rate": 2.4120082815734987e-07, "loss": -0.007, "num_tokens": 50131029.0, "reward": 0.833333432674408, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1551 }, { "completion_length": 1565.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3863.0, "completions/max_terminated_length": 3863.0, "completions/mean_length": 1565.416748046875, "completions/mean_terminated_length": 1565.416748046875, "completions/min_length": 550.0, "completions/min_terminated_length": 550.0, "epoch": 0.5264586160108549, "frac_reward_zero_std": 0.5, "grad_norm": 0.0745588019490242, "kl": 0.0, "learning_rate": 2.410282953761215e-07, "loss": -0.0005, "num_tokens": 50163518.0, "reward": 0.2875000238418579, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1552 }, { "completion_length": 790.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1737.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 790.6666870117188, "completions/mean_terminated_length": 790.6666870117188, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.526797829036635, "frac_reward_zero_std": 0.5, "grad_norm": 0.27708619832992554, "kl": 0.0, "learning_rate": 2.4085576259489303e-07, "loss": -0.0006, "num_tokens": 50187106.0, "reward": 0.833333432674408, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1553 }, { "completion_length": 1021.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 2314.0, "completions/mean_length": 1570.0833740234375, "completions/mean_terminated_length": 1113.8182373046875, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.5271370420624152, "frac_reward_zero_std": 0.0, "grad_norm": 0.3612026870250702, "kl": NaN, "learning_rate": 2.406832298136646e-07, "loss": -0.028, "num_tokens": 50208886.0, "reward": 1.024999976158142, "reward_std": 0.5235602855682373, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1554 }, { "completion_length": 2601.33349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 5796.0, "completions/max_terminated_length": 5796.0, "completions/mean_length": 2601.33349609375, "completions/mean_terminated_length": 2601.33349609375, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "epoch": 0.5274762550881954, "frac_reward_zero_std": 0.5, "grad_norm": 0.6916945576667786, "kl": 0.0, "learning_rate": 2.4051069703243614e-07, "loss": 0.0153, "num_tokens": 50255186.0, "reward": 1.0333335399627686, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.7333332896232605, "rewards/correctness_reward_func/std": 0.35505014657974243, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1555 }, { "completion_length": 733.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1166.0, "completions/max_terminated_length": 1166.0, "completions/mean_length": 733.25, "completions/mean_terminated_length": 733.25, "completions/min_length": 557.0, "completions/min_terminated_length": 557.0, "epoch": 0.5278154681139756, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.4033816425120774e-07, "loss": 0.0, "num_tokens": 50276399.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1556 }, { "completion_length": 739.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1087.0, "completions/max_terminated_length": 1087.0, "completions/mean_length": 739.5, "completions/mean_terminated_length": 739.5, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 0.5281546811397557, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.401656314699793e-07, "loss": 0.0, "num_tokens": 50294819.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1557 }, { "completion_length": 2062.8333740234375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6261.0, "completions/mean_length": 3161.0, "completions/mean_terminated_length": 2475.400146484375, "completions/min_length": 550.0, "completions/min_terminated_length": 550.0, "epoch": 0.5284938941655359, "frac_reward_zero_std": 0.5, "grad_norm": 0.3203006684780121, "kl": NaN, "learning_rate": 2.3999309868875085e-07, "loss": -0.0255, "num_tokens": 50335593.0, "reward": 0.25, "reward_std": 0.0774596780538559, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 1558 }, { "completion_length": 1212.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4286.0, "completions/max_terminated_length": 4286.0, "completions/mean_length": 1212.166748046875, "completions/mean_terminated_length": 1212.166748046875, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.5288331071913162, "frac_reward_zero_std": 0.5, "grad_norm": 0.569848358631134, "kl": 0.0, "learning_rate": 2.398205659075224e-07, "loss": 0.0299, "num_tokens": 50355695.0, "reward": 1.1166667938232422, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1559 }, { "completion_length": 1316.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2042.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1316.75, "completions/mean_terminated_length": 1316.75, "completions/min_length": 771.0, "completions/min_terminated_length": 771.0, "epoch": 0.5291723202170964, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.3964803312629395e-07, "loss": 0.0, "num_tokens": 50380562.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1560 }, { "completion_length": 1584.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3022.0, "completions/max_terminated_length": 3022.0, "completions/mean_length": 1584.75, "completions/mean_terminated_length": 1584.75, "completions/min_length": 838.0, "completions/min_terminated_length": 838.0, "epoch": 0.5295115332428765, "frac_reward_zero_std": 0.5, "grad_norm": 0.10423298180103302, "kl": 0.0, "learning_rate": 2.3947550034506556e-07, "loss": -0.0038, "num_tokens": 50415149.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1561 }, { "completion_length": 1854.416748046875, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5182.0, "completions/mean_length": 2952.58349609375, "completions/mean_terminated_length": 2225.300048828125, "completions/min_length": 647.0, "completions/min_terminated_length": 647.0, "epoch": 0.5298507462686567, "frac_reward_zero_std": 0.5, "grad_norm": 0.1793835163116455, "kl": NaN, "learning_rate": 2.393029675638371e-07, "loss": -0.0179, "num_tokens": 50446732.0, "reward": 0.25, "reward_std": 0.0774596780538559, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 1562 }, { "completion_length": 1560.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2934.0, "completions/max_terminated_length": 2934.0, "completions/mean_length": 1560.3333740234375, "completions/mean_terminated_length": 1560.3333740234375, "completions/min_length": 784.0, "completions/min_terminated_length": 784.0, "epoch": 0.5301899592944369, "frac_reward_zero_std": 0.0, "grad_norm": 0.2847871780395508, "kl": 0.0, "learning_rate": 2.391304347826087e-07, "loss": 0.005, "num_tokens": 50477396.0, "reward": 0.8333333730697632, "reward_std": 0.21493908762931824, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.47736650705337524, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1563 }, { "completion_length": 1045.4167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2149.0, "completions/max_terminated_length": 2149.0, "completions/mean_length": 1045.416748046875, "completions/mean_terminated_length": 1045.416748046875, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 0.5305291723202171, "frac_reward_zero_std": 0.5, "grad_norm": 0.07635991275310516, "kl": 0.0, "learning_rate": 2.3895790200138027e-07, "loss": -0.0007, "num_tokens": 50507971.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1564 }, { "completion_length": 965.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1726.0, "completions/max_terminated_length": 1726.0, "completions/mean_length": 965.75, "completions/mean_terminated_length": 965.75, "completions/min_length": 600.0, "completions/min_terminated_length": 600.0, "epoch": 0.5308683853459973, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.387853692201518e-07, "loss": 0.0, "num_tokens": 50533834.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1565 }, { "completion_length": 1010.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2363.0, "completions/max_terminated_length": 2363.0, "completions/mean_length": 1010.5, "completions/mean_terminated_length": 1010.5, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.5312075983717774, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.386128364389234e-07, "loss": 0.0, "num_tokens": 50558182.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1566 }, { "completion_length": 1162.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2623.0, "completions/max_terminated_length": 2623.0, "completions/mean_length": 1162.5, "completions/mean_terminated_length": 1162.5, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.5315468113975577, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.38440303657695e-07, "loss": 0.0, "num_tokens": 50586544.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1567 }, { "completion_length": 808.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1030.0, "completions/max_terminated_length": 1030.0, "completions/mean_length": 808.4166870117188, "completions/mean_terminated_length": 808.4166870117188, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 0.5318860244233379, "frac_reward_zero_std": 1.0, "grad_norm": 1.3751532890182716e-07, "kl": 0.0, "learning_rate": 2.382677708764665e-07, "loss": 0.0, "num_tokens": 50611611.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1568 }, { "completion_length": 1174.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2854.0, "completions/max_terminated_length": 2854.0, "completions/mean_length": 1174.666748046875, "completions/mean_terminated_length": 1174.666748046875, "completions/min_length": 543.0, "completions/min_terminated_length": 543.0, "epoch": 0.5322252374491181, "frac_reward_zero_std": 0.5, "grad_norm": 0.08520031720399857, "kl": 0.0, "learning_rate": 2.3809523809523806e-07, "loss": -0.002, "num_tokens": 50635625.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1569 }, { "completion_length": 1316.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4972.0, "completions/max_terminated_length": 4972.0, "completions/mean_length": 1316.8333740234375, "completions/mean_terminated_length": 1316.8333740234375, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.5325644504748982, "frac_reward_zero_std": 0.0, "grad_norm": 0.6034250259399414, "kl": 0.0, "learning_rate": 2.3792270531400964e-07, "loss": 0.0373, "num_tokens": 50660097.0, "reward": 1.2000000476837158, "reward_std": 0.24494895339012146, "rewards/correctness_reward_func/mean": 0.9000000357627869, "rewards/correctness_reward_func/std": 0.28919950127601624, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1570 }, { "completion_length": 1763.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3940.0, "completions/max_terminated_length": 3940.0, "completions/mean_length": 1763.3333740234375, "completions/mean_terminated_length": 1763.3333740234375, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 0.5329036635006784, "frac_reward_zero_std": 0.5, "grad_norm": 0.05989132449030876, "kl": 0.0, "learning_rate": 2.3775017253278122e-07, "loss": -0.0011, "num_tokens": 50696605.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1571 }, { "completion_length": 1012.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1464.0, "completions/max_terminated_length": 1464.0, "completions/mean_length": 1012.75, "completions/mean_terminated_length": 1012.75, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "epoch": 0.5332428765264586, "frac_reward_zero_std": 0.5, "grad_norm": 0.07890410721302032, "kl": 0.0, "learning_rate": 2.375776397515528e-07, "loss": 0.0009, "num_tokens": 50723398.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1572 }, { "completion_length": 887.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1860.0, "completions/max_terminated_length": 1860.0, "completions/mean_length": 887.75, "completions/mean_terminated_length": 887.75, "completions/min_length": 573.0, "completions/min_terminated_length": 573.0, "epoch": 0.5335820895522388, "frac_reward_zero_std": 0.5, "grad_norm": 0.047239385545253754, "kl": 0.0, "learning_rate": 2.3740510697032435e-07, "loss": -0.0007, "num_tokens": 50743225.0, "reward": 0.7875000834465027, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1573 }, { "completion_length": 1535.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2904.0, "completions/max_terminated_length": 2904.0, "completions/mean_length": 1535.8333740234375, "completions/mean_terminated_length": 1535.8333740234375, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 0.533921302578019, "frac_reward_zero_std": 0.5, "grad_norm": 0.36960774660110474, "kl": 0.0, "learning_rate": 2.3723257418909593e-07, "loss": -0.0044, "num_tokens": 50774135.0, "reward": 1.0208334922790527, "reward_std": 0.19391795992851257, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.2309401035308838, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1574 }, { "completion_length": 701.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1231.0, "completions/max_terminated_length": 1231.0, "completions/mean_length": 701.5833740234375, "completions/mean_terminated_length": 701.5833740234375, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 0.5342605156037992, "frac_reward_zero_std": 0.5, "grad_norm": 0.0621166005730629, "kl": 0.0, "learning_rate": 2.3706004140786748e-07, "loss": 0.0004, "num_tokens": 50793480.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1575 }, { "completion_length": 2497.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6501.0, "completions/max_terminated_length": 6501.0, "completions/mean_length": 2497.5, "completions/mean_terminated_length": 2497.5, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.5345997286295794, "frac_reward_zero_std": 0.0, "grad_norm": 0.8288344144821167, "kl": 0.0, "learning_rate": 2.3688750862663906e-07, "loss": -0.0758, "num_tokens": 50834712.0, "reward": 0.9500000476837158, "reward_std": 0.29902371764183044, "rewards/correctness_reward_func/mean": 0.6500000357627869, "rewards/correctness_reward_func/std": 0.40113475918769836, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1576 }, { "completion_length": 1455.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2673.0, "completions/max_terminated_length": 2673.0, "completions/mean_length": 1455.0833740234375, "completions/mean_terminated_length": 1455.0833740234375, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.5349389416553596, "frac_reward_zero_std": 1.0, "grad_norm": 1.668771005824965e-07, "kl": 0.0, "learning_rate": 2.3671497584541062e-07, "loss": 0.0, "num_tokens": 50868631.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1577 }, { "completion_length": 984.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2586.0, "completions/max_terminated_length": 2586.0, "completions/mean_length": 984.3333740234375, "completions/mean_terminated_length": 984.3333740234375, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.5352781546811397, "frac_reward_zero_std": 0.5, "grad_norm": 0.5765162110328674, "kl": 0.0, "learning_rate": 2.365424430641822e-07, "loss": 0.0182, "num_tokens": 50894057.0, "reward": 1.1166666746139526, "reward_std": 0.24832776188850403, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.3857303261756897, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1578 }, { "completion_length": 1107.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1895.0, "completions/max_terminated_length": 1895.0, "completions/mean_length": 1107.0, "completions/mean_terminated_length": 1107.0, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.5356173677069199, "frac_reward_zero_std": 0.0, "grad_norm": 0.2748401463031769, "kl": 0.0, "learning_rate": 2.3636991028295375e-07, "loss": 0.0138, "num_tokens": 50917373.0, "reward": 0.8500000834465027, "reward_std": 0.2557638883590698, "rewards/correctness_reward_func/mean": 0.550000011920929, "rewards/correctness_reward_func/std": 0.49082493782043457, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1579 }, { "completion_length": 936.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1986.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 936.9166870117188, "completions/mean_terminated_length": 936.9166870117188, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 0.5359565807327001, "frac_reward_zero_std": 0.0, "grad_norm": 0.10216211527585983, "kl": 0.0, "learning_rate": 2.361973775017253e-07, "loss": 0.0008, "num_tokens": 50941570.0, "reward": 1.1666667461395264, "reward_std": 0.09559707343578339, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1580 }, { "completion_length": 1026.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1739.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 1026.75, "completions/mean_terminated_length": 1026.75, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 0.5362957937584804, "frac_reward_zero_std": 0.5, "grad_norm": 0.08197390288114548, "kl": 0.0, "learning_rate": 2.3602484472049688e-07, "loss": -0.0001, "num_tokens": 50964121.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1581 }, { "completion_length": 446.75, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 446.75, "completions/mean_terminated_length": 446.75, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.5366350067842606, "frac_reward_zero_std": 0.0, "grad_norm": 0.189627006649971, "kl": 0.0, "learning_rate": 2.3585231193926843e-07, "loss": 0.0019, "num_tokens": 50983072.0, "reward": 1.0500000715255737, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.24308621883392334, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1582 }, { "completion_length": 1694.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4547.0, "completions/max_terminated_length": 4547.0, "completions/mean_length": 1694.166748046875, "completions/mean_terminated_length": 1694.166748046875, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "epoch": 0.5369742198100407, "frac_reward_zero_std": 0.5, "grad_norm": 0.5707111954689026, "kl": 0.0, "learning_rate": 2.3567977915804e-07, "loss": 0.0193, "num_tokens": 51019398.0, "reward": 1.0833332538604736, "reward_std": 0.222860187292099, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.37618499994277954, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1583 }, { "completion_length": 2072.5, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6218.0, "completions/mean_length": 3719.75, "completions/mean_terminated_length": 2763.333251953125, "completions/min_length": 810.0, "completions/min_terminated_length": 810.0, "epoch": 0.5373134328358209, "frac_reward_zero_std": 0.0, "grad_norm": 0.5051462054252625, "kl": NaN, "learning_rate": 2.3550724637681156e-07, "loss": -0.0162, "num_tokens": 51054174.0, "reward": 0.5708333253860474, "reward_std": 0.2370387315750122, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.23750001192092896, "rewards/format_reward_func/std": 0.11894422769546509, "step": 1584 }, { "completion_length": 708.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1324.0, "completions/max_terminated_length": 1324.0, "completions/mean_length": 708.5833740234375, "completions/mean_terminated_length": 708.5833740234375, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.5376526458616011, "frac_reward_zero_std": 1.0, "grad_norm": 1.444353188162495e-07, "kl": 0.0, "learning_rate": 2.3533471359558314e-07, "loss": 0.0, "num_tokens": 51076321.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1585 }, { "completion_length": 1118.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2052.0, "completions/max_terminated_length": 2052.0, "completions/mean_length": 1118.5, "completions/mean_terminated_length": 1118.5, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 0.5379918588873813, "frac_reward_zero_std": 1.0, "grad_norm": 1.8097115628279425e-07, "kl": 0.0, "learning_rate": 2.3516218081435472e-07, "loss": 0.0, "num_tokens": 51097681.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1586 }, { "completion_length": 597.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 597.4166870117188, "completions/mean_terminated_length": 597.4166870117188, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.5383310719131614, "frac_reward_zero_std": 1.0, "grad_norm": 8.901039194597615e-08, "kl": 0.0, "learning_rate": 2.349896480331263e-07, "loss": 0.0, "num_tokens": 51119436.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1587 }, { "completion_length": 736.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1032.0, "completions/max_terminated_length": 1032.0, "completions/mean_length": 736.5833740234375, "completions/mean_terminated_length": 736.5833740234375, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.5386702849389416, "frac_reward_zero_std": 0.5, "grad_norm": 0.06285547465085983, "kl": 0.0, "learning_rate": 2.3481711525189786e-07, "loss": -0.0011, "num_tokens": 51137029.0, "reward": 0.7666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1588 }, { "completion_length": 1015.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1986.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 1015.3333740234375, "completions/mean_terminated_length": 1015.3333740234375, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.5390094979647219, "frac_reward_zero_std": 0.0, "grad_norm": 0.5281247496604919, "kl": 0.0, "learning_rate": 2.3464458247066943e-07, "loss": 0.0081, "num_tokens": 51163025.0, "reward": 1.133333444595337, "reward_std": 0.24494896829128265, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1589 }, { "completion_length": 1009.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2114.0, "completions/max_terminated_length": 2114.0, "completions/mean_length": 1009.8333740234375, "completions/mean_terminated_length": 1009.8333740234375, "completions/min_length": 653.0, "completions/min_terminated_length": 653.0, "epoch": 0.5393487109905021, "frac_reward_zero_std": 0.0, "grad_norm": 0.08285267651081085, "kl": 0.0, "learning_rate": 2.34472049689441e-07, "loss": -0.001, "num_tokens": 51185973.0, "reward": 1.2000000476837158, "reward_std": 0.08164963126182556, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1590 }, { "completion_length": 431.00001525878906, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 431.0, "completions/mean_terminated_length": 431.0, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.5396879240162822, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.3429951690821257e-07, "loss": 0.0, "num_tokens": 51203601.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1591 }, { "completion_length": 944.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1336.0, "completions/max_terminated_length": 1336.0, "completions/mean_length": 944.8333740234375, "completions/mean_terminated_length": 944.8333740234375, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 0.5400271370420624, "frac_reward_zero_std": 1.0, "grad_norm": 1.6128075230881223e-07, "kl": 0.0, "learning_rate": 2.3412698412698412e-07, "loss": 0.0, "num_tokens": 51227779.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1592 }, { "completion_length": 1248.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2834.0, "completions/max_terminated_length": 2834.0, "completions/mean_length": 1248.166748046875, "completions/mean_terminated_length": 1248.166748046875, "completions/min_length": 551.0, "completions/min_terminated_length": 551.0, "epoch": 0.5403663500678426, "frac_reward_zero_std": 1.0, "grad_norm": 1.3582008762114128e-07, "kl": 0.0, "learning_rate": 2.3395445134575567e-07, "loss": 0.0, "num_tokens": 51254643.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1593 }, { "completion_length": 726.5, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 726.5, "completions/mean_terminated_length": 726.5, "completions/min_length": 524.0, "completions/min_terminated_length": 524.0, "epoch": 0.5407055630936228, "frac_reward_zero_std": 0.0, "grad_norm": 0.24478761851787567, "kl": 0.0, "learning_rate": 2.3378191856452725e-07, "loss": 0.0002, "num_tokens": 51275175.0, "reward": 0.8333333730697632, "reward_std": 0.21493908762931824, "rewards/correctness_reward_func/mean": 0.5333333015441895, "rewards/correctness_reward_func/std": 0.47736650705337524, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1594 }, { "completion_length": 909.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1394.0, "completions/max_terminated_length": 1394.0, "completions/mean_length": 909.25, "completions/mean_terminated_length": 909.25, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.5410447761194029, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.336093857832988e-07, "loss": 0.0, "num_tokens": 51295338.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1595 }, { "completion_length": 1915.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4938.0, "completions/max_terminated_length": 4938.0, "completions/mean_length": 1915.25, "completions/mean_terminated_length": 1915.25, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 0.5413839891451832, "frac_reward_zero_std": 1.0, "grad_norm": 2.0728261063140963e-07, "kl": 0.0, "learning_rate": 2.3343685300207038e-07, "loss": 0.0, "num_tokens": 51329151.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1596 }, { "completion_length": 434.25, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 434.25, "completions/mean_terminated_length": 434.25, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.5417232021709634, "frac_reward_zero_std": 0.5, "grad_norm": 0.05525250732898712, "kl": 0.0, "learning_rate": 2.3326432022084194e-07, "loss": -0.0002, "num_tokens": 51342966.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1597 }, { "completion_length": 900.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1601.0, "completions/max_terminated_length": 1601.0, "completions/mean_length": 900.6666870117188, "completions/mean_terminated_length": 900.6666870117188, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.5420624151967436, "frac_reward_zero_std": 0.5, "grad_norm": 0.07637582719326019, "kl": 0.0, "learning_rate": 2.3309178743961352e-07, "loss": 0.0015, "num_tokens": 51365102.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1598 }, { "completion_length": 1064.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2222.0, "completions/max_terminated_length": 2222.0, "completions/mean_length": 1064.5833740234375, "completions/mean_terminated_length": 1064.5833740234375, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 0.5424016282225237, "frac_reward_zero_std": 0.5, "grad_norm": 0.06685810536146164, "kl": 0.0, "learning_rate": 2.3291925465838507e-07, "loss": -0.0003, "num_tokens": 51387297.0, "reward": 1.25, "reward_std": 0.05477222427725792, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1599 }, { "completion_length": 1094.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2589.0, "completions/max_terminated_length": 2589.0, "completions/mean_length": 1094.0833740234375, "completions/mean_terminated_length": 1094.0833740234375, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 0.5427408412483039, "frac_reward_zero_std": 0.5, "grad_norm": 0.10844426602125168, "kl": 0.0, "learning_rate": 2.3274672187715665e-07, "loss": 0.0018, "num_tokens": 51412378.0, "reward": 1.25, "reward_std": 0.05477222427725792, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1600 }, { "completion_length": 720.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1340.0, "completions/max_terminated_length": 1340.0, "completions/mean_length": 720.6666870117188, "completions/mean_terminated_length": 720.6666870117188, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.5430800542740841, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.325741890959282e-07, "loss": 0.0, "num_tokens": 51431214.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1601 }, { "completion_length": 2000.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4265.0, "completions/max_terminated_length": 4265.0, "completions/mean_length": 2000.666748046875, "completions/mean_terminated_length": 2000.666748046875, "completions/min_length": 608.0, "completions/min_terminated_length": 608.0, "epoch": 0.5434192672998643, "frac_reward_zero_std": 0.5, "grad_norm": 0.09570232778787613, "kl": 0.0, "learning_rate": 2.324016563146998e-07, "loss": 0.0013, "num_tokens": 51465074.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1602 }, { "completion_length": 2326.166748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6557.0, "completions/mean_length": 2875.25, "completions/mean_terminated_length": 2537.636474609375, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 0.5437584803256446, "frac_reward_zero_std": 0.0, "grad_norm": 0.9599380493164062, "kl": NaN, "learning_rate": 2.3222912353347136e-07, "loss": -0.0635, "num_tokens": 51503914.0, "reward": 0.5583333969116211, "reward_std": 0.46779459714889526, "rewards/correctness_reward_func/mean": 0.28333333134651184, "rewards/correctness_reward_func/std": 0.4217568039894104, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1603 }, { "completion_length": 1160.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1765.0, "completions/max_terminated_length": 1765.0, "completions/mean_length": 1160.416748046875, "completions/mean_terminated_length": 1160.416748046875, "completions/min_length": 645.0, "completions/min_terminated_length": 645.0, "epoch": 0.5440976933514247, "frac_reward_zero_std": 1.0, "grad_norm": 1.577116677253798e-07, "kl": 0.0, "learning_rate": 2.320565907522429e-07, "loss": 0.0, "num_tokens": 51536079.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1604 }, { "completion_length": 690.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 927.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 690.0, "completions/mean_terminated_length": 690.0, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 0.5444369063772049, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.318840579710145e-07, "loss": 0.0, "num_tokens": 51558849.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1605 }, { "completion_length": 1505.25, "completions/clipped_ratio": 0.0, "completions/max_length": 4236.0, "completions/max_terminated_length": 4236.0, "completions/mean_length": 1505.25, "completions/mean_terminated_length": 1505.25, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.5447761194029851, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.3171152518978604e-07, "loss": 0.0, "num_tokens": 51589086.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1606 }, { "completion_length": 1972.8333740234375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6413.0, "completions/mean_length": 3071.0, "completions/mean_terminated_length": 2367.400146484375, "completions/min_length": 637.0, "completions/min_terminated_length": 637.0, "epoch": 0.5451153324287653, "frac_reward_zero_std": 0.5, "grad_norm": 0.1497505009174347, "kl": NaN, "learning_rate": 2.3153899240855762e-07, "loss": -0.0118, "num_tokens": 51624868.0, "reward": 0.7749999761581421, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1607 }, { "completion_length": 1416.4166870117188, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3594.0, "completions/mean_length": 1965.5, "completions/mean_terminated_length": 1545.181884765625, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.5454545454545454, "frac_reward_zero_std": 0.5, "grad_norm": 0.5870721936225891, "kl": NaN, "learning_rate": 2.3136645962732918e-07, "loss": -0.0338, "num_tokens": 51658851.0, "reward": 1.0416667461395264, "reward_std": 0.24983328580856323, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941801071167, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 1608 }, { "completion_length": 940.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2248.0, "completions/max_terminated_length": 2248.0, "completions/mean_length": 940.8333740234375, "completions/mean_terminated_length": 940.8333740234375, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.5457937584803256, "frac_reward_zero_std": 1.0, "grad_norm": 2.659915878666652e-07, "kl": 0.0, "learning_rate": 2.3119392684610076e-07, "loss": 0.0, "num_tokens": 51683317.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1609 }, { "completion_length": 490.41668701171875, "completions/clipped_ratio": 0.0, "completions/max_length": 773.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 490.41668701171875, "completions/mean_terminated_length": 490.41668701171875, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.5461329715061058, "frac_reward_zero_std": 1.0, "grad_norm": 1.5120255625333812e-07, "kl": 0.0, "learning_rate": 2.310213940648723e-07, "loss": 0.0, "num_tokens": 51701676.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1610 }, { "completion_length": 1286.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2710.0, "completions/max_terminated_length": 2710.0, "completions/mean_length": 1286.916748046875, "completions/mean_terminated_length": 1286.916748046875, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.5464721845318861, "frac_reward_zero_std": 0.0, "grad_norm": 0.4060056507587433, "kl": 0.0, "learning_rate": 2.308488612836439e-07, "loss": -0.001, "num_tokens": 51723557.0, "reward": 1.0833334922790527, "reward_std": 0.2270147055387497, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1611 }, { "completion_length": 1170.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1988.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 1170.0833740234375, "completions/mean_terminated_length": 1170.0833740234375, "completions/min_length": 711.0, "completions/min_terminated_length": 711.0, "epoch": 0.5468113975576662, "frac_reward_zero_std": 0.5, "grad_norm": 0.06772714853286743, "kl": 0.0, "learning_rate": 2.3067632850241544e-07, "loss": -0.0005, "num_tokens": 51748116.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1612 }, { "completion_length": 796.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1508.0, "completions/max_terminated_length": 1508.0, "completions/mean_length": 796.9166870117188, "completions/mean_terminated_length": 796.9166870117188, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 0.5471506105834464, "frac_reward_zero_std": 0.5, "grad_norm": 0.08029603958129883, "kl": 0.0, "learning_rate": 2.3050379572118702e-07, "loss": 0.0008, "num_tokens": 51770651.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1613 }, { "completion_length": 1503.5833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4233.0, "completions/mean_length": 2052.666748046875, "completions/mean_terminated_length": 1640.2728271484375, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 0.5474898236092266, "frac_reward_zero_std": 0.0, "grad_norm": 0.29853522777557373, "kl": NaN, "learning_rate": 2.3033126293995857e-07, "loss": -0.0037, "num_tokens": 51802308.0, "reward": 0.6875, "reward_std": 0.2306186407804489, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4972652792930603, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1614 }, { "completion_length": 946.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2026.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 946.4166870117188, "completions/mean_terminated_length": 946.4166870117188, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 0.5478290366350068, "frac_reward_zero_std": 0.5, "grad_norm": 0.3336067795753479, "kl": 0.0, "learning_rate": 2.3015873015873013e-07, "loss": -0.0086, "num_tokens": 51825011.0, "reward": 1.183333396911621, "reward_std": 0.19407901167869568, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1615 }, { "completion_length": 2460.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6539.0, "completions/max_terminated_length": 6539.0, "completions/mean_length": 2460.666748046875, "completions/mean_terminated_length": 2460.666748046875, "completions/min_length": 697.0, "completions/min_terminated_length": 697.0, "epoch": 0.5481682496607869, "frac_reward_zero_std": 0.0, "grad_norm": 0.4294710159301758, "kl": 0.0, "learning_rate": 2.299861973775017e-07, "loss": 0.0052, "num_tokens": 51867331.0, "reward": 0.8166667819023132, "reward_std": 0.21807155013084412, "rewards/correctness_reward_func/mean": 0.5166666507720947, "rewards/correctness_reward_func/std": 0.4628632962703705, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1616 }, { "completion_length": 817.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1072.0, "completions/max_terminated_length": 1072.0, "completions/mean_length": 817.3333740234375, "completions/mean_terminated_length": 817.3333740234375, "completions/min_length": 623.0, "completions/min_terminated_length": 623.0, "epoch": 0.5485074626865671, "frac_reward_zero_std": 0.5, "grad_norm": 0.2997523844242096, "kl": 0.0, "learning_rate": 2.2981366459627326e-07, "loss": -0.0003, "num_tokens": 51887069.0, "reward": 0.7166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1617 }, { "completion_length": 758.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1319.0, "completions/max_terminated_length": 1319.0, "completions/mean_length": 758.75, "completions/mean_terminated_length": 758.75, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 0.5488466757123474, "frac_reward_zero_std": 1.0, "grad_norm": 1.899996107113111e-07, "kl": 0.0, "learning_rate": 2.2964113181504486e-07, "loss": 0.0, "num_tokens": 51907670.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1618 }, { "completion_length": 746.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1377.0, "completions/max_terminated_length": 1377.0, "completions/mean_length": 746.8333740234375, "completions/mean_terminated_length": 746.8333740234375, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.5491858887381276, "frac_reward_zero_std": 0.5, "grad_norm": 0.0592019259929657, "kl": 0.0, "learning_rate": 2.2946859903381642e-07, "loss": -0.0007, "num_tokens": 51930882.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1619 }, { "completion_length": 3236.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4992.0, "completions/max_terminated_length": 4992.0, "completions/mean_length": 3236.416748046875, "completions/mean_terminated_length": 3236.416748046875, "completions/min_length": 1602.0, "completions/min_terminated_length": 1602.0, "epoch": 0.5495251017639078, "frac_reward_zero_std": 1.0, "grad_norm": 3.803697836701758e-07, "kl": 0.0, "learning_rate": 2.29296066252588e-07, "loss": 0.0, "num_tokens": 51981725.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1620 }, { "completion_length": 1432.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2628.0, "completions/max_terminated_length": 2628.0, "completions/mean_length": 1432.0, "completions/mean_terminated_length": 1432.0, "completions/min_length": 643.0, "completions/min_terminated_length": 643.0, "epoch": 0.5498643147896879, "frac_reward_zero_std": 0.0, "grad_norm": 0.09942618757486343, "kl": 0.0, "learning_rate": 2.2912353347135955e-07, "loss": -0.0013, "num_tokens": 52012367.0, "reward": 1.2666666507720947, "reward_std": 0.08164961636066437, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1621 }, { "completion_length": 2027.166748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5655.0, "completions/mean_length": 2576.25, "completions/mean_terminated_length": 2211.45458984375, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.5502035278154681, "frac_reward_zero_std": 0.0, "grad_norm": 0.42404603958129883, "kl": NaN, "learning_rate": 2.2895100069013113e-07, "loss": -0.0596, "num_tokens": 52046671.0, "reward": 0.9833334684371948, "reward_std": 0.2473839521408081, "rewards/correctness_reward_func/mean": 0.6833333969116211, "rewards/correctness_reward_func/std": 0.32427075505256653, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1622 }, { "completion_length": 694.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 694.25, "completions/mean_terminated_length": 694.25, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.5505427408412483, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.2877846790890268e-07, "loss": 0.0, "num_tokens": 52067140.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1623 }, { "completion_length": 1523.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2600.0, "completions/max_terminated_length": 2600.0, "completions/mean_length": 1523.3333740234375, "completions/mean_terminated_length": 1523.3333740234375, "completions/min_length": 614.0, "completions/min_terminated_length": 614.0, "epoch": 0.5508819538670285, "frac_reward_zero_std": 1.0, "grad_norm": 2.412836579424038e-07, "kl": 0.0, "learning_rate": 2.2860593512767426e-07, "loss": 0.0, "num_tokens": 52098290.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1624 }, { "completion_length": 1571.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3638.0, "completions/max_terminated_length": 3638.0, "completions/mean_length": 1571.416748046875, "completions/mean_terminated_length": 1571.416748046875, "completions/min_length": 644.0, "completions/min_terminated_length": 644.0, "epoch": 0.5512211668928086, "frac_reward_zero_std": 0.5, "grad_norm": 0.10258523374795914, "kl": 0.0, "learning_rate": 2.284334023464458e-07, "loss": 0.0006, "num_tokens": 52131601.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1625 }, { "completion_length": 2699.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6251.0, "completions/max_terminated_length": 6251.0, "completions/mean_length": 2699.5, "completions/mean_terminated_length": 2699.5, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 0.5515603799185889, "frac_reward_zero_std": 0.5, "grad_norm": 0.04761405289173126, "kl": 0.0, "learning_rate": 2.2826086956521737e-07, "loss": 0.0008, "num_tokens": 52178989.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1626 }, { "completion_length": 1152.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1426.0, "completions/max_terminated_length": 1426.0, "completions/mean_length": 1152.25, "completions/mean_terminated_length": 1152.25, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "epoch": 0.5518995929443691, "frac_reward_zero_std": 0.5, "grad_norm": 0.33489876985549927, "kl": 0.0, "learning_rate": 2.2808833678398895e-07, "loss": -0.0038, "num_tokens": 52204168.0, "reward": 1.066666603088379, "reward_std": 0.20655910670757294, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941503047943, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1627 }, { "completion_length": 946.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2044.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 946.9166870117188, "completions/mean_terminated_length": 946.9166870117188, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.5522388059701493, "frac_reward_zero_std": 0.5, "grad_norm": 0.10333229601383209, "kl": 0.0, "learning_rate": 2.279158040027605e-07, "loss": 0.0004, "num_tokens": 52225533.0, "reward": 1.2333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1628 }, { "completion_length": 2941.83349609375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5912.0, "completions/mean_length": 3490.916748046875, "completions/mean_terminated_length": 3209.272705078125, "completions/min_length": 1421.0, "completions/min_terminated_length": 1421.0, "epoch": 0.5525780189959294, "frac_reward_zero_std": 0.5, "grad_norm": 0.14109471440315247, "kl": NaN, "learning_rate": 2.2774327122153208e-07, "loss": -0.0136, "num_tokens": 52272367.0, "reward": 0.27500003576278687, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1629 }, { "completion_length": 1454.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3363.0, "completions/max_terminated_length": 3363.0, "completions/mean_length": 1454.75, "completions/mean_terminated_length": 1454.75, "completions/min_length": 559.0, "completions/min_terminated_length": 559.0, "epoch": 0.5529172320217096, "frac_reward_zero_std": 0.0, "grad_norm": 0.11473068594932556, "kl": 0.0, "learning_rate": 2.2757073844030363e-07, "loss": 0.0002, "num_tokens": 52301440.0, "reward": 1.2166666984558105, "reward_std": 0.09246456623077393, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1630 }, { "completion_length": 680.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1108.0, "completions/max_terminated_length": 1108.0, "completions/mean_length": 680.6666870117188, "completions/mean_terminated_length": 680.6666870117188, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 0.5532564450474898, "frac_reward_zero_std": 1.0, "grad_norm": 1.0094655777947992e-07, "kl": 0.0, "learning_rate": 2.273982056590752e-07, "loss": 0.0, "num_tokens": 52318800.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1631 }, { "completion_length": 726.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1087.0, "completions/max_terminated_length": 1087.0, "completions/mean_length": 726.9166870117188, "completions/mean_terminated_length": 726.9166870117188, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 0.55359565807327, "frac_reward_zero_std": 0.5, "grad_norm": 0.07460886240005493, "kl": 0.0, "learning_rate": 2.2722567287784676e-07, "loss": 0.0005, "num_tokens": 52342247.0, "reward": 1.2333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1632 }, { "completion_length": 1383.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3154.0, "completions/max_terminated_length": 3154.0, "completions/mean_length": 1383.666748046875, "completions/mean_terminated_length": 1383.666748046875, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.5539348710990502, "frac_reward_zero_std": 0.5, "grad_norm": 0.10120590776205063, "kl": 0.0, "learning_rate": 2.2705314009661837e-07, "loss": 0.0006, "num_tokens": 52370953.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1633 }, { "completion_length": 888.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1359.0, "completions/max_terminated_length": 1359.0, "completions/mean_length": 888.75, "completions/mean_terminated_length": 888.75, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 0.5542740841248304, "frac_reward_zero_std": 0.5, "grad_norm": 0.4447871148586273, "kl": 0.0, "learning_rate": 2.2688060731538992e-07, "loss": 0.0071, "num_tokens": 52394806.0, "reward": 1.0499999523162842, "reward_std": 0.27386125922203064, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1634 }, { "completion_length": 1767.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4022.0, "completions/max_terminated_length": 4022.0, "completions/mean_length": 1767.8333740234375, "completions/mean_terminated_length": 1767.8333740234375, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "epoch": 0.5546132971506106, "frac_reward_zero_std": 1.0, "grad_norm": 1.5388769725177553e-07, "kl": 0.0, "learning_rate": 2.267080745341615e-07, "loss": 0.0, "num_tokens": 52430192.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1635 }, { "completion_length": 813.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1433.0, "completions/max_terminated_length": 1433.0, "completions/mean_length": 813.3333740234375, "completions/mean_terminated_length": 813.3333740234375, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.5549525101763908, "frac_reward_zero_std": 1.0, "grad_norm": 8.638762949431111e-08, "kl": 0.0, "learning_rate": 2.2653554175293305e-07, "loss": 0.0, "num_tokens": 52448442.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1636 }, { "completion_length": 825.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2390.0, "completions/max_terminated_length": 2390.0, "completions/mean_length": 825.5833740234375, "completions/mean_terminated_length": 825.5833740234375, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 0.555291723202171, "frac_reward_zero_std": 0.5, "grad_norm": 0.07061552256345749, "kl": 0.0, "learning_rate": 2.263630089717046e-07, "loss": -0.0015, "num_tokens": 52469833.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1637 }, { "completion_length": 1052.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1929.0, "completions/max_terminated_length": 1929.0, "completions/mean_length": 1052.25, "completions/mean_terminated_length": 1052.25, "completions/min_length": 710.0, "completions/min_terminated_length": 710.0, "epoch": 0.5556309362279511, "frac_reward_zero_std": 0.0, "grad_norm": 0.11609834432601929, "kl": 0.0, "learning_rate": 2.2619047619047619e-07, "loss": 0.0027, "num_tokens": 52493812.0, "reward": 1.2333333492279053, "reward_std": 0.0955970510840416, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1638 }, { "completion_length": 1251.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2502.0, "completions/max_terminated_length": 2502.0, "completions/mean_length": 1251.0833740234375, "completions/mean_terminated_length": 1251.0833740234375, "completions/min_length": 635.0, "completions/min_terminated_length": 635.0, "epoch": 0.5559701492537313, "frac_reward_zero_std": 0.5, "grad_norm": 0.43270978331565857, "kl": 0.0, "learning_rate": 2.2601794340924774e-07, "loss": -0.0027, "num_tokens": 52519355.0, "reward": 0.8666667342185974, "reward_std": 0.2581988573074341, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.42497774958610535, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1639 }, { "completion_length": 942.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1436.0, "completions/max_terminated_length": 1436.0, "completions/mean_length": 942.5, "completions/mean_terminated_length": 942.5, "completions/min_length": 572.0, "completions/min_terminated_length": 572.0, "epoch": 0.5563093622795116, "frac_reward_zero_std": 0.5, "grad_norm": 0.06681928783655167, "kl": 0.0, "learning_rate": 2.2584541062801932e-07, "loss": -0.0001, "num_tokens": 52541525.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1640 }, { "completion_length": 717.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1342.0, "completions/max_terminated_length": 1342.0, "completions/mean_length": 717.8333740234375, "completions/mean_terminated_length": 717.8333740234375, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.5566485753052918, "frac_reward_zero_std": 0.5, "grad_norm": 0.06881088763475418, "kl": 0.0, "learning_rate": 2.2567287784679087e-07, "loss": 0.0004, "num_tokens": 52559901.0, "reward": 1.25, "reward_std": 0.05477222427725792, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1641 }, { "completion_length": 2050.0001220703125, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6425.0, "completions/mean_length": 3148.166748046875, "completions/mean_terminated_length": 2460.0, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 0.5569877883310719, "frac_reward_zero_std": 0.0, "grad_norm": 0.16281110048294067, "kl": NaN, "learning_rate": 2.2550034506556245e-07, "loss": -0.027, "num_tokens": 52601373.0, "reward": 0.7333333492279053, "reward_std": 0.11828448623418808, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 1642 }, { "completion_length": 715.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1155.0, "completions/max_terminated_length": 1155.0, "completions/mean_length": 715.5833740234375, "completions/mean_terminated_length": 715.5833740234375, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 0.5573270013568521, "frac_reward_zero_std": 1.0, "grad_norm": 9.19285554346061e-08, "kl": 0.0, "learning_rate": 2.25327812284334e-07, "loss": 0.0, "num_tokens": 52622806.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1643 }, { "completion_length": 1442.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2655.0, "completions/max_terminated_length": 2655.0, "completions/mean_length": 1442.75, "completions/mean_terminated_length": 1442.75, "completions/min_length": 653.0, "completions/min_terminated_length": 653.0, "epoch": 0.5576662143826323, "frac_reward_zero_std": 0.5, "grad_norm": 0.11178168654441833, "kl": 0.0, "learning_rate": 2.2515527950310558e-07, "loss": 0.0029, "num_tokens": 52655473.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1644 }, { "completion_length": 831.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1184.0, "completions/max_terminated_length": 1184.0, "completions/mean_length": 831.75, "completions/mean_terminated_length": 831.75, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "epoch": 0.5580054274084125, "frac_reward_zero_std": 0.5, "grad_norm": 0.07372111827135086, "kl": 0.0, "learning_rate": 2.2498274672187713e-07, "loss": -0.0004, "num_tokens": 52678876.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1645 }, { "completion_length": 466.50001525878906, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 466.5, "completions/mean_terminated_length": 466.5, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.5583446404341926, "frac_reward_zero_std": 0.5, "grad_norm": 0.05687154084444046, "kl": 0.0, "learning_rate": 2.2481021394064871e-07, "loss": 0.0002, "num_tokens": 52694278.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1646 }, { "completion_length": 626.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1213.0, "completions/max_terminated_length": 1213.0, "completions/mean_length": 626.1666870117188, "completions/mean_terminated_length": 626.1666870117188, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.5586838534599728, "frac_reward_zero_std": 1.0, "grad_norm": 1.2539709359771223e-07, "kl": 0.0, "learning_rate": 2.2463768115942027e-07, "loss": 0.0, "num_tokens": 52712202.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1647 }, { "completion_length": 1978.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4506.0, "completions/max_terminated_length": 4506.0, "completions/mean_length": 1978.25, "completions/mean_terminated_length": 1978.25, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.5590230664857531, "frac_reward_zero_std": 1.0, "grad_norm": 1.3730266346101416e-07, "kl": 0.0, "learning_rate": 2.2446514837819187e-07, "loss": 0.0, "num_tokens": 52747227.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1648 }, { "completion_length": 722.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 722.75, "completions/mean_terminated_length": 722.75, "completions/min_length": 553.0, "completions/min_terminated_length": 553.0, "epoch": 0.5593622795115333, "frac_reward_zero_std": 0.5, "grad_norm": 0.09186825901269913, "kl": 0.0, "learning_rate": 2.2429261559696343e-07, "loss": 0.0001, "num_tokens": 52767690.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1649 }, { "completion_length": 901.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2569.0, "completions/max_terminated_length": 2569.0, "completions/mean_length": 901.0, "completions/mean_terminated_length": 901.0, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.5597014925373134, "frac_reward_zero_std": 0.5, "grad_norm": 0.07321745157241821, "kl": 0.0, "learning_rate": 2.2412008281573498e-07, "loss": 0.0002, "num_tokens": 52786974.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1650 }, { "completion_length": 1569.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2369.0, "completions/max_terminated_length": 2369.0, "completions/mean_length": 1569.0, "completions/mean_terminated_length": 1569.0, "completions/min_length": 722.0, "completions/min_terminated_length": 722.0, "epoch": 0.5600407055630936, "frac_reward_zero_std": 0.5, "grad_norm": 0.11625803261995316, "kl": 0.0, "learning_rate": 2.2394755003450656e-07, "loss": 0.0005, "num_tokens": 52816494.0, "reward": 0.7333333492279053, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1651 }, { "completion_length": 998.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2158.0, "completions/max_terminated_length": 2158.0, "completions/mean_length": 998.25, "completions/mean_terminated_length": 998.25, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.5603799185888738, "frac_reward_zero_std": 0.0, "grad_norm": 0.4001650810241699, "kl": 0.0, "learning_rate": 2.237750172532781e-07, "loss": 0.002, "num_tokens": 52840095.0, "reward": 1.133333444595337, "reward_std": 0.24494896829128265, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1652 }, { "completion_length": 2477.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 6308.0, "completions/max_terminated_length": 6308.0, "completions/mean_length": 2477.5, "completions/mean_terminated_length": 2477.5, "completions/min_length": 1429.0, "completions/min_terminated_length": 1429.0, "epoch": 0.560719131614654, "frac_reward_zero_std": 0.5, "grad_norm": 0.51358562707901, "kl": 0.0, "learning_rate": 2.236024844720497e-07, "loss": 0.0099, "num_tokens": 52878285.0, "reward": 0.8833333849906921, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.5149286389350891, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1653 }, { "completion_length": 617.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1186.0, "completions/max_terminated_length": 1186.0, "completions/mean_length": 617.3333740234375, "completions/mean_terminated_length": 617.3333740234375, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.5610583446404342, "frac_reward_zero_std": 0.5, "grad_norm": 0.07032569497823715, "kl": 0.0, "learning_rate": 2.2342995169082124e-07, "loss": -0.0006, "num_tokens": 52896487.0, "reward": 0.7666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1654 }, { "completion_length": 2089.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5167.0, "completions/max_terminated_length": 5167.0, "completions/mean_length": 2089.75, "completions/mean_terminated_length": 2089.75, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 0.5613975576662144, "frac_reward_zero_std": 0.5, "grad_norm": 0.07167855650186539, "kl": 0.0, "learning_rate": 2.2325741890959282e-07, "loss": 0.0007, "num_tokens": 52933774.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1655 }, { "completion_length": 1377.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2003.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1377.5, "completions/mean_terminated_length": 1377.5, "completions/min_length": 787.0, "completions/min_terminated_length": 787.0, "epoch": 0.5617367706919946, "frac_reward_zero_std": 0.5, "grad_norm": 0.09533493965864182, "kl": 0.0, "learning_rate": 2.2308488612836437e-07, "loss": -0.0008, "num_tokens": 52966330.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1656 }, { "completion_length": 1370.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3510.0, "completions/max_terminated_length": 3510.0, "completions/mean_length": 1370.25, "completions/mean_terminated_length": 1370.25, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "epoch": 0.5620759837177748, "frac_reward_zero_std": 1.0, "grad_norm": 2.037649409203368e-07, "kl": 0.0, "learning_rate": 2.2291235334713595e-07, "loss": 0.0, "num_tokens": 52997167.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1657 }, { "completion_length": 1574.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2481.0, "completions/max_terminated_length": 2481.0, "completions/mean_length": 1574.5, "completions/mean_terminated_length": 1574.5, "completions/min_length": 733.0, "completions/min_terminated_length": 733.0, "epoch": 0.562415196743555, "frac_reward_zero_std": 0.5, "grad_norm": 0.12863564491271973, "kl": 0.0, "learning_rate": 2.227398205659075e-07, "loss": 0.0013, "num_tokens": 53027797.0, "reward": 0.7666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1658 }, { "completion_length": 417.4166717529297, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 417.41668701171875, "completions/mean_terminated_length": 417.41668701171875, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.5627544097693351, "frac_reward_zero_std": 1.0, "grad_norm": 1.0332446720440203e-07, "kl": 0.0, "learning_rate": 2.2256728778467909e-07, "loss": 0.0, "num_tokens": 53041920.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1659 }, { "completion_length": 1103.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2381.0, "completions/max_terminated_length": 2381.0, "completions/mean_length": 1103.75, "completions/mean_terminated_length": 1103.75, "completions/min_length": 609.0, "completions/min_terminated_length": 609.0, "epoch": 0.5630936227951153, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.2239475500345064e-07, "loss": 0.0, "num_tokens": 53068059.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1660 }, { "completion_length": 756.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1396.0, "completions/max_terminated_length": 1396.0, "completions/mean_length": 756.5833740234375, "completions/mean_terminated_length": 756.5833740234375, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.5634328358208955, "frac_reward_zero_std": 0.5, "grad_norm": 0.047998420894145966, "kl": 0.0, "learning_rate": 2.222222222222222e-07, "loss": -0.0002, "num_tokens": 53088160.0, "reward": 1.1625001430511475, "reward_std": 0.04107918590307236, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 1661 }, { "completion_length": 966.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3906.0, "completions/max_terminated_length": 3906.0, "completions/mean_length": 966.75, "completions/mean_terminated_length": 966.75, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.5637720488466758, "frac_reward_zero_std": 0.5, "grad_norm": 0.10353852063417435, "kl": 0.0, "learning_rate": 2.2204968944099377e-07, "loss": -0.0063, "num_tokens": 53111707.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1662 }, { "completion_length": 1330.0833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 2989.0, "completions/mean_length": 1879.166748046875, "completions/mean_terminated_length": 1451.0, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "epoch": 0.5641112618724559, "frac_reward_zero_std": 0.0, "grad_norm": 0.2819819450378418, "kl": NaN, "learning_rate": 2.2187715665976532e-07, "loss": -0.0294, "num_tokens": 53139626.0, "reward": 1.1083333492279053, "reward_std": 0.29743778705596924, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.28069180250167847, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1663 }, { "completion_length": 1064.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1592.0, "completions/max_terminated_length": 1592.0, "completions/mean_length": 1064.0, "completions/mean_terminated_length": 1064.0, "completions/min_length": 588.0, "completions/min_terminated_length": 588.0, "epoch": 0.5644504748982361, "frac_reward_zero_std": 1.0, "grad_norm": 1.9489071689804405e-07, "kl": 0.0, "learning_rate": 2.2170462387853693e-07, "loss": 0.0, "num_tokens": 53163050.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1664 }, { "completion_length": 2161.33349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 4035.0, "completions/max_terminated_length": 4035.0, "completions/mean_length": 2161.33349609375, "completions/mean_terminated_length": 2161.33349609375, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 0.5647896879240163, "frac_reward_zero_std": 1.0, "grad_norm": 1.4657211977464613e-07, "kl": 0.0, "learning_rate": 2.2153209109730848e-07, "loss": 0.0, "num_tokens": 53200482.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1665 }, { "completion_length": 477.75, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 1407.0, "completions/mean_length": 3772.25, "completions/mean_terminated_length": 955.5, "completions/min_length": 624.0, "completions/min_terminated_length": 624.0, "epoch": 0.5651289009497965, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": NaN, "learning_rate": 2.2135955831608006e-07, "loss": 0.0, "num_tokens": 53217489.0, "reward": 0.15000000596046448, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.15000000596046448, "rewards/format_reward_func/std": 0.15666989982128143, "step": 1666 }, { "completion_length": 1988.8334350585938, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6318.0, "completions/mean_length": 3087.0, "completions/mean_terminated_length": 2386.60009765625, "completions/min_length": 547.0, "completions/min_terminated_length": 547.0, "epoch": 0.5654681139755766, "frac_reward_zero_std": 0.5, "grad_norm": 1.028896450996399, "kl": NaN, "learning_rate": 2.2118702553485161e-07, "loss": -0.0907, "num_tokens": 53256115.0, "reward": 0.9791667461395264, "reward_std": 0.30676400661468506, "rewards/correctness_reward_func/mean": 0.7166666984558105, "rewards/correctness_reward_func/std": 0.3459725081920624, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 1667 }, { "completion_length": 852.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2225.0, "completions/max_terminated_length": 2225.0, "completions/mean_length": 852.6666870117188, "completions/mean_terminated_length": 852.6666870117188, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.5658073270013568, "frac_reward_zero_std": 1.0, "grad_norm": 1.5932769770188315e-07, "kl": 0.0, "learning_rate": 2.210144927536232e-07, "loss": 0.0, "num_tokens": 53273187.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1668 }, { "completion_length": 501.16668701171875, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 501.16668701171875, "completions/mean_terminated_length": 501.16668701171875, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.566146540027137, "frac_reward_zero_std": 1.0, "grad_norm": 1.347583520328044e-07, "kl": 0.0, "learning_rate": 2.2084195997239475e-07, "loss": 0.0, "num_tokens": 53292119.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1669 }, { "completion_length": 1217.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2258.0, "completions/max_terminated_length": 2258.0, "completions/mean_length": 1217.166748046875, "completions/mean_terminated_length": 1217.166748046875, "completions/min_length": 517.0, "completions/min_terminated_length": 517.0, "epoch": 0.5664857530529173, "frac_reward_zero_std": 0.5, "grad_norm": 0.5359957218170166, "kl": 0.0, "learning_rate": 2.2066942719116633e-07, "loss": -0.012, "num_tokens": 53317279.0, "reward": 0.949999988079071, "reward_std": 0.2345207780599594, "rewards/correctness_reward_func/mean": 0.6500000357627869, "rewards/correctness_reward_func/std": 0.4833594560623169, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1670 }, { "completion_length": 555.75, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 555.75, "completions/mean_terminated_length": 555.75, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 0.5668249660786974, "frac_reward_zero_std": 1.0, "grad_norm": 1.149532309341339e-07, "kl": 0.0, "learning_rate": 2.2049689440993788e-07, "loss": 0.0, "num_tokens": 53335240.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1671 }, { "completion_length": 1385.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2809.0, "completions/max_terminated_length": 2809.0, "completions/mean_length": 1385.25, "completions/mean_terminated_length": 1385.25, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.5671641791044776, "frac_reward_zero_std": 0.5, "grad_norm": 0.05244852229952812, "kl": 0.0, "learning_rate": 2.2032436162870943e-07, "loss": 0.0028, "num_tokens": 53367691.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1672 }, { "completion_length": 830.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1381.0, "completions/max_terminated_length": 1381.0, "completions/mean_length": 830.3333740234375, "completions/mean_terminated_length": 830.3333740234375, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 0.5675033921302578, "frac_reward_zero_std": 1.0, "grad_norm": 1.4212218957254663e-07, "kl": 0.0, "learning_rate": 2.20151828847481e-07, "loss": 0.0, "num_tokens": 53392265.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1673 }, { "completion_length": 1794.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4179.0, "completions/max_terminated_length": 4179.0, "completions/mean_length": 1794.5833740234375, "completions/mean_terminated_length": 1794.5833740234375, "completions/min_length": 623.0, "completions/min_terminated_length": 623.0, "epoch": 0.567842605156038, "frac_reward_zero_std": 0.5, "grad_norm": 0.12209474295377731, "kl": 0.0, "learning_rate": 2.1997929606625256e-07, "loss": -0.004, "num_tokens": 53426544.0, "reward": 1.2333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1674 }, { "completion_length": 1031.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1509.0, "completions/max_terminated_length": 1509.0, "completions/mean_length": 1031.8333740234375, "completions/mean_terminated_length": 1031.8333740234375, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 0.5681818181818182, "frac_reward_zero_std": 0.0, "grad_norm": 0.12974810600280762, "kl": 0.0, "learning_rate": 2.1980676328502414e-07, "loss": -0.0019, "num_tokens": 53445814.0, "reward": 1.183333396911621, "reward_std": 0.10641200840473175, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1675 }, { "completion_length": 1086.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1792.0, "completions/max_terminated_length": 1792.0, "completions/mean_length": 1086.0, "completions/mean_terminated_length": 1086.0, "completions/min_length": 533.0, "completions/min_terminated_length": 533.0, "epoch": 0.5685210312075983, "frac_reward_zero_std": 0.5, "grad_norm": 0.06776907294988632, "kl": 0.0, "learning_rate": 2.196342305037957e-07, "loss": -0.001, "num_tokens": 53473726.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1676 }, { "completion_length": 1702.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3419.0, "completions/max_terminated_length": 3419.0, "completions/mean_length": 1702.8333740234375, "completions/mean_terminated_length": 1702.8333740234375, "completions/min_length": 631.0, "completions/min_terminated_length": 631.0, "epoch": 0.5688602442333786, "frac_reward_zero_std": 0.0, "grad_norm": 0.14789369702339172, "kl": 0.0, "learning_rate": 2.1946169772256728e-07, "loss": -0.0048, "num_tokens": 53507270.0, "reward": 1.183333396911621, "reward_std": 0.09246458858251572, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1677 }, { "completion_length": 886.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1562.0, "completions/max_terminated_length": 1562.0, "completions/mean_length": 886.6666870117188, "completions/mean_terminated_length": 886.6666870117188, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 0.5691994572591588, "frac_reward_zero_std": 1.0, "grad_norm": 1.6787380729965662e-07, "kl": 0.0, "learning_rate": 2.1928916494133883e-07, "loss": 0.0, "num_tokens": 53528398.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1678 }, { "completion_length": 1015.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2394.0, "completions/max_terminated_length": 2394.0, "completions/mean_length": 1015.75, "completions/mean_terminated_length": 1015.75, "completions/min_length": 614.0, "completions/min_terminated_length": 614.0, "epoch": 0.569538670284939, "frac_reward_zero_std": 0.5, "grad_norm": 0.1084146797657013, "kl": 0.0, "learning_rate": 2.1911663216011043e-07, "loss": 0.0023, "num_tokens": 53550391.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1679 }, { "completion_length": 683.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1119.0, "completions/max_terminated_length": 1119.0, "completions/mean_length": 683.0, "completions/mean_terminated_length": 683.0, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.5698778833107191, "frac_reward_zero_std": 1.0, "grad_norm": 1.703596126390039e-07, "kl": 0.0, "learning_rate": 2.1894409937888199e-07, "loss": 0.0, "num_tokens": 53575525.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1680 }, { "completion_length": 1548.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2303.0, "completions/max_terminated_length": 2303.0, "completions/mean_length": 1548.5, "completions/mean_terminated_length": 1548.5, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 0.5702170963364993, "frac_reward_zero_std": 0.5, "grad_norm": 0.087087482213974, "kl": 0.0, "learning_rate": 2.1877156659765357e-07, "loss": -0.0009, "num_tokens": 53605525.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1681 }, { "completion_length": 662.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 805.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 662.3333740234375, "completions/mean_terminated_length": 662.3333740234375, "completions/min_length": 525.0, "completions/min_terminated_length": 525.0, "epoch": 0.5705563093622795, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.1859903381642512e-07, "loss": 0.0, "num_tokens": 53627555.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1682 }, { "completion_length": 1243.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 3242.0, "completions/max_terminated_length": 3242.0, "completions/mean_length": 1243.5833740234375, "completions/mean_terminated_length": 1243.5833740234375, "completions/min_length": 592.0, "completions/min_terminated_length": 592.0, "epoch": 0.5708955223880597, "frac_reward_zero_std": 1.0, "grad_norm": 9.607090589724976e-08, "kl": 0.0, "learning_rate": 2.1842650103519667e-07, "loss": 0.0, "num_tokens": 53655858.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1683 }, { "completion_length": 783.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1525.0, "completions/max_terminated_length": 1525.0, "completions/mean_length": 783.8333740234375, "completions/mean_terminated_length": 783.8333740234375, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.5712347354138398, "frac_reward_zero_std": 0.5, "grad_norm": 0.06900150328874588, "kl": 0.0, "learning_rate": 2.1825396825396825e-07, "loss": 0.0, "num_tokens": 53673994.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.45000001788139343, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1684 }, { "completion_length": 1832.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3883.0, "completions/max_terminated_length": 3883.0, "completions/mean_length": 1832.416748046875, "completions/mean_terminated_length": 1832.416748046875, "completions/min_length": 897.0, "completions/min_terminated_length": 897.0, "epoch": 0.5715739484396201, "frac_reward_zero_std": 0.0, "grad_norm": 0.4185093343257904, "kl": 0.0, "learning_rate": 2.180814354727398e-07, "loss": -0.0068, "num_tokens": 53706237.0, "reward": 0.8000000715255737, "reward_std": 0.21493908762931824, "rewards/correctness_reward_func/mean": 0.4999999701976776, "rewards/correctness_reward_func/std": 0.4472135901451111, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1685 }, { "completion_length": 877.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2194.0, "completions/max_terminated_length": 2194.0, "completions/mean_length": 877.8333740234375, "completions/mean_terminated_length": 877.8333740234375, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.5719131614654003, "frac_reward_zero_std": 0.5, "grad_norm": 0.4576956629753113, "kl": 0.0, "learning_rate": 2.1790890269151138e-07, "loss": -0.0133, "num_tokens": 53728237.0, "reward": 0.48750004172325134, "reward_std": 0.23438750207424164, "rewards/correctness_reward_func/mean": 0.20000000298023224, "rewards/correctness_reward_func/std": 0.36181363463401794, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1686 }, { "completion_length": 3718.0833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5134.0, "completions/mean_length": 4267.1669921875, "completions/mean_terminated_length": 4056.091064453125, "completions/min_length": 3271.0, "completions/min_terminated_length": 3271.0, "epoch": 0.5722523744911805, "frac_reward_zero_std": 0.5, "grad_norm": 0.3341676592826843, "kl": NaN, "learning_rate": 2.1773636991028294e-07, "loss": -0.052, "num_tokens": 53783978.0, "reward": 0.6916667819023132, "reward_std": 0.26536139845848083, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 1687 }, { "completion_length": 1090.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2017.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1090.8333740234375, "completions/mean_terminated_length": 1090.8333740234375, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.5725915875169606, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.1756383712905452e-07, "loss": 0.0, "num_tokens": 53806950.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1688 }, { "completion_length": 1972.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3749.0, "completions/max_terminated_length": 3749.0, "completions/mean_length": 1972.416748046875, "completions/mean_terminated_length": 1972.416748046875, "completions/min_length": 926.0, "completions/min_terminated_length": 926.0, "epoch": 0.5729308005427408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.1739130434782607e-07, "loss": 0.0, "num_tokens": 53845847.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1689 }, { "completion_length": 669.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 904.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 669.6666870117188, "completions/mean_terminated_length": 669.6666870117188, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 0.573270013568521, "frac_reward_zero_std": 1.0, "grad_norm": 8.028143128058218e-08, "kl": 0.0, "learning_rate": 2.1721877156659765e-07, "loss": 0.0, "num_tokens": 53869393.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1690 }, { "completion_length": 1146.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2217.0, "completions/max_terminated_length": 2217.0, "completions/mean_length": 1146.166748046875, "completions/mean_terminated_length": 1146.166748046875, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 0.5736092265943012, "frac_reward_zero_std": 0.5, "grad_norm": 0.08629421889781952, "kl": 0.0, "learning_rate": 2.170462387853692e-07, "loss": 0.0005, "num_tokens": 53899083.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1691 }, { "completion_length": 1341.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1900.0, "completions/max_terminated_length": 1900.0, "completions/mean_length": 1341.0833740234375, "completions/mean_terminated_length": 1341.0833740234375, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 0.5739484396200815, "frac_reward_zero_std": 0.0, "grad_norm": 0.3742624521255493, "kl": 0.0, "learning_rate": 2.1687370600414078e-07, "loss": 0.0024, "num_tokens": 53927452.0, "reward": 1.1500000953674316, "reward_std": 0.2270146608352661, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444522619247437, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1692 }, { "completion_length": 2808.5001220703125, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5159.0, "completions/mean_length": 4455.75, "completions/mean_terminated_length": 3744.666748046875, "completions/min_length": 1109.0, "completions/min_terminated_length": 1109.0, "epoch": 0.5742876526458616, "frac_reward_zero_std": 0.0, "grad_norm": 1.5461046695709229, "kl": NaN, "learning_rate": 2.1670117322291233e-07, "loss": -0.1021, "num_tokens": 53972764.0, "reward": 0.6000000238418579, "reward_std": 0.5022222399711609, "rewards/correctness_reward_func/mean": 0.3500000238418579, "rewards/correctness_reward_func/std": 0.4358898997306824, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.09770084172487259, "step": 1693 }, { "completion_length": 2537.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6296.0, "completions/max_terminated_length": 6296.0, "completions/mean_length": 2537.25, "completions/mean_terminated_length": 2537.25, "completions/min_length": 928.0, "completions/min_terminated_length": 928.0, "epoch": 0.5746268656716418, "frac_reward_zero_std": 0.5, "grad_norm": 0.13768364489078522, "kl": 0.0, "learning_rate": 2.1652864044168389e-07, "loss": -0.0027, "num_tokens": 54016231.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1694 }, { "completion_length": 2229.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3939.0, "completions/max_terminated_length": 3939.0, "completions/mean_length": 2229.666748046875, "completions/mean_terminated_length": 2229.666748046875, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "epoch": 0.574966078697422, "frac_reward_zero_std": 1.0, "grad_norm": 9.53095948830196e-08, "kl": 0.0, "learning_rate": 2.163561076604555e-07, "loss": 0.0, "num_tokens": 54059301.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1695 }, { "completion_length": 766.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1699.0, "completions/max_terminated_length": 1699.0, "completions/mean_length": 766.25, "completions/mean_terminated_length": 766.25, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 0.5753052917232022, "frac_reward_zero_std": 0.5, "grad_norm": 0.045238472521305084, "kl": 0.0, "learning_rate": 2.1618357487922704e-07, "loss": -0.001, "num_tokens": 54081534.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1696 }, { "completion_length": 1092.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 1861.0, "completions/max_terminated_length": 1861.0, "completions/mean_length": 1092.916748046875, "completions/mean_terminated_length": 1092.916748046875, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 0.5756445047489823, "frac_reward_zero_std": 1.0, "grad_norm": 2.0555006585709634e-07, "kl": 0.0, "learning_rate": 2.1601104209799862e-07, "loss": 0.0, "num_tokens": 54108995.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1697 }, { "completion_length": 2253.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6310.0, "completions/max_terminated_length": 6310.0, "completions/mean_length": 2253.666748046875, "completions/mean_terminated_length": 2253.666748046875, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.5759837177747625, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.1583850931677018e-07, "loss": 0.0, "num_tokens": 54148075.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1698 }, { "completion_length": 1661.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3301.0, "completions/max_terminated_length": 3301.0, "completions/mean_length": 1661.0833740234375, "completions/mean_terminated_length": 1661.0833740234375, "completions/min_length": 621.0, "completions/min_terminated_length": 621.0, "epoch": 0.5763229308005428, "frac_reward_zero_std": 1.0, "grad_norm": 2.528492188957898e-07, "kl": 0.0, "learning_rate": 2.1566597653554176e-07, "loss": 0.0, "num_tokens": 54177896.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1699 }, { "completion_length": 1622.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3528.0, "completions/max_terminated_length": 3528.0, "completions/mean_length": 1622.0833740234375, "completions/mean_terminated_length": 1622.0833740234375, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.576662143826323, "frac_reward_zero_std": 0.5, "grad_norm": 0.12352310866117477, "kl": 0.0, "learning_rate": 2.154934437543133e-07, "loss": 0.0002, "num_tokens": 54210393.0, "reward": 0.7666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1700 }, { "completion_length": 2118.8334350585938, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6074.0, "completions/mean_length": 3217.0, "completions/mean_terminated_length": 2542.60009765625, "completions/min_length": 713.0, "completions/min_terminated_length": 713.0, "epoch": 0.5770013568521031, "frac_reward_zero_std": 0.5, "grad_norm": 0.09835956990718842, "kl": NaN, "learning_rate": 2.153209109730849e-07, "loss": -0.0173, "num_tokens": 54247711.0, "reward": 0.6625000834465027, "reward_std": 0.06274950504302979, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 1701 }, { "completion_length": 1278.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2619.0, "completions/max_terminated_length": 2619.0, "completions/mean_length": 1278.0833740234375, "completions/mean_terminated_length": 1278.0833740234375, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.5773405698778833, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.1514837819185644e-07, "loss": 0.0, "num_tokens": 54278420.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1702 }, { "completion_length": 1153.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1847.0, "completions/max_terminated_length": 1847.0, "completions/mean_length": 1153.8333740234375, "completions/mean_terminated_length": 1153.8333740234375, "completions/min_length": 577.0, "completions/min_terminated_length": 577.0, "epoch": 0.5776797829036635, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.1497584541062802e-07, "loss": 0.0, "num_tokens": 54302508.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1703 }, { "completion_length": 2199.3334350585938, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5649.0, "completions/mean_length": 2748.416748046875, "completions/mean_terminated_length": 2399.272705078125, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 0.5780189959294437, "frac_reward_zero_std": 0.5, "grad_norm": 0.4652014970779419, "kl": NaN, "learning_rate": 2.1480331262939957e-07, "loss": -0.0623, "num_tokens": 54337666.0, "reward": 1.191666603088379, "reward_std": 0.26536139845848083, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 1704 }, { "completion_length": 1180.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1754.0, "completions/max_terminated_length": 1754.0, "completions/mean_length": 1180.25, "completions/mean_terminated_length": 1180.25, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 0.5783582089552238, "frac_reward_zero_std": 1.0, "grad_norm": 1.6632176880193583e-07, "kl": 0.0, "learning_rate": 2.1463077984817115e-07, "loss": 0.0, "num_tokens": 54364363.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1705 }, { "completion_length": 1127.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2314.0, "completions/max_terminated_length": 2314.0, "completions/mean_length": 1127.416748046875, "completions/mean_terminated_length": 1127.416748046875, "completions/min_length": 616.0, "completions/min_terminated_length": 616.0, "epoch": 0.578697421981004, "frac_reward_zero_std": 0.0, "grad_norm": 0.4303831458091736, "kl": 0.0, "learning_rate": 2.144582470669427e-07, "loss": -0.0092, "num_tokens": 54387060.0, "reward": 1.133333444595337, "reward_std": 0.24494896829128265, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1706 }, { "completion_length": 1707.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4935.0, "completions/max_terminated_length": 4935.0, "completions/mean_length": 1707.75, "completions/mean_terminated_length": 1707.75, "completions/min_length": 610.0, "completions/min_terminated_length": 610.0, "epoch": 0.5790366350067843, "frac_reward_zero_std": 0.5, "grad_norm": 0.10593991726636887, "kl": 0.0, "learning_rate": 2.1428571428571426e-07, "loss": -0.0044, "num_tokens": 54416199.0, "reward": 0.7666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1707 }, { "completion_length": 571.8333587646484, "completions/clipped_ratio": 0.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 571.8333740234375, "completions/mean_terminated_length": 571.8333740234375, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.5793758480325645, "frac_reward_zero_std": 0.0, "grad_norm": 0.08911672234535217, "kl": 0.0, "learning_rate": 2.1411318150448584e-07, "loss": 0.0001, "num_tokens": 54433705.0, "reward": 1.133333444595337, "reward_std": 0.0869503766298294, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.07385490089654922, "step": 1708 }, { "completion_length": 1131.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2250.0, "completions/max_terminated_length": 2250.0, "completions/mean_length": 1131.8333740234375, "completions/mean_terminated_length": 1131.8333740234375, "completions/min_length": 707.0, "completions/min_terminated_length": 707.0, "epoch": 0.5797150610583447, "frac_reward_zero_std": 0.5, "grad_norm": 0.10156305879354477, "kl": 0.0, "learning_rate": 2.139406487232574e-07, "loss": 0.001, "num_tokens": 54461909.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1709 }, { "completion_length": 506.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 506.75, "completions/mean_terminated_length": 506.75, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.5800542740841248, "frac_reward_zero_std": 1.0, "grad_norm": 7.83178535357365e-08, "kl": 0.0, "learning_rate": 2.1376811594202897e-07, "loss": 0.0, "num_tokens": 54480938.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1710 }, { "completion_length": 752.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2180.0, "completions/max_terminated_length": 2180.0, "completions/mean_length": 752.5, "completions/mean_terminated_length": 752.5, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.580393487109905, "frac_reward_zero_std": 1.0, "grad_norm": 1.4247483193230437e-07, "kl": 0.0, "learning_rate": 2.1359558316080055e-07, "loss": 0.0, "num_tokens": 54503846.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1711 }, { "completion_length": 1338.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3320.0, "completions/max_terminated_length": 3320.0, "completions/mean_length": 1338.5833740234375, "completions/mean_terminated_length": 1338.5833740234375, "completions/min_length": 649.0, "completions/min_terminated_length": 649.0, "epoch": 0.5807327001356852, "frac_reward_zero_std": 0.5, "grad_norm": 0.06322566419839859, "kl": 0.0, "learning_rate": 2.1342305037957213e-07, "loss": -0.0002, "num_tokens": 54533439.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1712 }, { "completion_length": 2574.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5589.0, "completions/max_terminated_length": 5589.0, "completions/mean_length": 2574.166748046875, "completions/mean_terminated_length": 2574.166748046875, "completions/min_length": 1186.0, "completions/min_terminated_length": 1186.0, "epoch": 0.5810719131614654, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.1325051759834368e-07, "loss": 0.0, "num_tokens": 54575867.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1713 }, { "completion_length": 628.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1225.0, "completions/max_terminated_length": 1225.0, "completions/mean_length": 628.25, "completions/mean_terminated_length": 628.25, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.5814111261872456, "frac_reward_zero_std": 0.0, "grad_norm": 0.07242586463689804, "kl": 0.0, "learning_rate": 2.1307798481711526e-07, "loss": 0.0006, "num_tokens": 54593804.0, "reward": 1.1500000953674316, "reward_std": 0.09246458858251572, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1714 }, { "completion_length": 1377.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4987.0, "completions/max_terminated_length": 4987.0, "completions/mean_length": 1377.166748046875, "completions/mean_terminated_length": 1377.166748046875, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 0.5817503392130258, "frac_reward_zero_std": 0.5, "grad_norm": 0.06162019073963165, "kl": 0.0, "learning_rate": 2.129054520358868e-07, "loss": 0.0003, "num_tokens": 54621472.0, "reward": 0.7333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1715 }, { "completion_length": 1050.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1320.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 1050.3333740234375, "completions/mean_terminated_length": 1050.3333740234375, "completions/min_length": 792.0, "completions/min_terminated_length": 792.0, "epoch": 0.582089552238806, "frac_reward_zero_std": 0.5, "grad_norm": 0.08028385043144226, "kl": 0.0, "learning_rate": 2.127329192546584e-07, "loss": -0.0001, "num_tokens": 54647234.0, "reward": 1.2333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1716 }, { "completion_length": 1328.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3165.0, "completions/max_terminated_length": 3165.0, "completions/mean_length": 1328.0833740234375, "completions/mean_terminated_length": 1328.0833740234375, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.5824287652645862, "frac_reward_zero_std": 1.0, "grad_norm": 1.0510674286479116e-07, "kl": 0.0, "learning_rate": 2.1256038647342994e-07, "loss": 0.0, "num_tokens": 54674757.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1717 }, { "completion_length": 866.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1780.0, "completions/max_terminated_length": 1780.0, "completions/mean_length": 866.6666870117188, "completions/mean_terminated_length": 866.6666870117188, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "epoch": 0.5827679782903663, "frac_reward_zero_std": 0.5, "grad_norm": 0.061516404151916504, "kl": 0.0, "learning_rate": 2.123878536922015e-07, "loss": -0.0004, "num_tokens": 54694949.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1718 }, { "completion_length": 1044.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1446.0, "completions/max_terminated_length": 1446.0, "completions/mean_length": 1044.0833740234375, "completions/mean_terminated_length": 1044.0833740234375, "completions/min_length": 591.0, "completions/min_terminated_length": 591.0, "epoch": 0.5831071913161465, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.1221532091097308e-07, "loss": 0.0, "num_tokens": 54719970.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1719 }, { "completion_length": 2515.416748046875, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5790.0, "completions/mean_length": 5260.83349609375, "completions/mean_terminated_length": 4312.14306640625, "completions/min_length": 1192.0, "completions/min_terminated_length": 1192.0, "epoch": 0.5834464043419267, "frac_reward_zero_std": 0.0, "grad_norm": 0.41656506061553955, "kl": NaN, "learning_rate": 2.1204278812974463e-07, "loss": -0.0724, "num_tokens": 54760631.0, "reward": 0.5083333253860474, "reward_std": 0.30199623107910156, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.17499999701976776, "rewards/format_reward_func/std": 0.15447859466075897, "step": 1720 }, { "completion_length": 516.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 516.0, "completions/mean_terminated_length": 516.0, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.583785617367707, "frac_reward_zero_std": 0.5, "grad_norm": 0.03805525600910187, "kl": 0.0, "learning_rate": 2.118702553485162e-07, "loss": -0.0, "num_tokens": 54781259.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1721 }, { "completion_length": 825.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1187.0, "completions/max_terminated_length": 1187.0, "completions/mean_length": 825.5833740234375, "completions/mean_terminated_length": 825.5833740234375, "completions/min_length": 514.0, "completions/min_terminated_length": 514.0, "epoch": 0.5841248303934871, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.1169772256728776e-07, "loss": 0.0, "num_tokens": 54802758.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1722 }, { "completion_length": 1537.3333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4464.0, "completions/mean_length": 2086.416748046875, "completions/mean_terminated_length": 1677.0909423828125, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.5844640434192673, "frac_reward_zero_std": 0.5, "grad_norm": 0.37191975116729736, "kl": NaN, "learning_rate": 2.1152518978605934e-07, "loss": -0.0352, "num_tokens": 54829180.0, "reward": 0.6083333492279053, "reward_std": 0.22453658282756805, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1723 }, { "completion_length": 1098.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3686.0, "completions/max_terminated_length": 3686.0, "completions/mean_length": 1098.666748046875, "completions/mean_terminated_length": 1098.666748046875, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.5848032564450475, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.113526570048309e-07, "loss": 0.0, "num_tokens": 54856476.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1724 }, { "completion_length": 1588.416748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4873.0, "completions/mean_length": 2137.5, "completions/mean_terminated_length": 1732.8182373046875, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "epoch": 0.5851424694708277, "frac_reward_zero_std": 0.5, "grad_norm": 0.4410051107406616, "kl": NaN, "learning_rate": 2.1118012422360247e-07, "loss": -0.0087, "num_tokens": 54887807.0, "reward": 1.0416667461395264, "reward_std": 0.24983328580856323, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941801071167, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 1725 }, { "completion_length": 630.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 848.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 630.6666870117188, "completions/mean_terminated_length": 630.6666870117188, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.5854816824966079, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.1100759144237405e-07, "loss": 0.0, "num_tokens": 54907429.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1726 }, { "completion_length": 841.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1981.0, "completions/max_terminated_length": 1981.0, "completions/mean_length": 841.9166870117188, "completions/mean_terminated_length": 841.9166870117188, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 0.585820895522388, "frac_reward_zero_std": 0.5, "grad_norm": 0.06473913043737411, "kl": 0.0, "learning_rate": 2.1083505866114563e-07, "loss": 0.0003, "num_tokens": 54928782.0, "reward": 0.7333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1727 }, { "completion_length": 2542.75, "completions/clipped_ratio": 0.0, "completions/max_length": 6545.0, "completions/max_terminated_length": 6545.0, "completions/mean_length": 2542.75, "completions/mean_terminated_length": 2542.75, "completions/min_length": 852.0, "completions/min_terminated_length": 852.0, "epoch": 0.5861601085481682, "frac_reward_zero_std": 1.0, "grad_norm": 1.7747044012139668e-07, "kl": 0.0, "learning_rate": 2.1066252587991718e-07, "loss": 0.0, "num_tokens": 54972369.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1728 }, { "completion_length": 3593.3334350585938, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6439.0, "completions/mean_length": 5789.6669921875, "completions/mean_terminated_length": 5390.0, "completions/min_length": 3040.0, "completions/min_terminated_length": 3040.0, "epoch": 0.5864993215739485, "frac_reward_zero_std": 0.0, "grad_norm": 0.8373323082923889, "kl": NaN, "learning_rate": 2.1048999309868874e-07, "loss": -0.1173, "num_tokens": 55033105.0, "reward": 0.46666669845581055, "reward_std": 0.36147844791412354, "rewards/correctness_reward_func/mean": 0.2666666805744171, "rewards/correctness_reward_func/std": 0.393892765045166, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 1729 }, { "completion_length": 760.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1806.0, "completions/max_terminated_length": 1806.0, "completions/mean_length": 760.9166870117188, "completions/mean_terminated_length": 760.9166870117188, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 0.5868385345997287, "frac_reward_zero_std": 0.5, "grad_norm": 0.11199633777141571, "kl": 0.0, "learning_rate": 2.1031746031746032e-07, "loss": 0.0001, "num_tokens": 55055388.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1730 }, { "completion_length": 1698.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 5115.0, "completions/max_terminated_length": 5115.0, "completions/mean_length": 1698.416748046875, "completions/mean_terminated_length": 1698.416748046875, "completions/min_length": 775.0, "completions/min_terminated_length": 775.0, "epoch": 0.5871777476255088, "frac_reward_zero_std": 0.5, "grad_norm": 0.5819225311279297, "kl": 0.0, "learning_rate": 2.1014492753623187e-07, "loss": 0.0032, "num_tokens": 55085471.0, "reward": 1.0541666746139526, "reward_std": 0.22716552019119263, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941801071167, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1731 }, { "completion_length": 2175.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 3474.0, "completions/max_terminated_length": 3474.0, "completions/mean_length": 2175.75, "completions/mean_terminated_length": 2175.75, "completions/min_length": 626.0, "completions/min_terminated_length": 626.0, "epoch": 0.587516960651289, "frac_reward_zero_std": 1.0, "grad_norm": 1.5091225691321597e-07, "kl": 0.0, "learning_rate": 2.0997239475500345e-07, "loss": 0.0, "num_tokens": 55120838.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1732 }, { "completion_length": 902.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1394.0, "completions/max_terminated_length": 1394.0, "completions/mean_length": 902.25, "completions/mean_terminated_length": 902.25, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 0.5878561736770692, "frac_reward_zero_std": 0.5, "grad_norm": 0.047891441732645035, "kl": 0.0, "learning_rate": 2.09799861973775e-07, "loss": -0.0007, "num_tokens": 55140587.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1733 }, { "completion_length": 617.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 944.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 617.5, "completions/mean_terminated_length": 617.5, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.5881953867028494, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.0962732919254658e-07, "loss": 0.0, "num_tokens": 55160753.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1734 }, { "completion_length": 920.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1968.0, "completions/max_terminated_length": 1968.0, "completions/mean_length": 920.1666870117188, "completions/mean_terminated_length": 920.1666870117188, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 0.5885345997286295, "frac_reward_zero_std": 0.5, "grad_norm": 0.40745091438293457, "kl": 0.0, "learning_rate": 2.0945479641131813e-07, "loss": 0.0073, "num_tokens": 55185019.0, "reward": 1.1500000953674316, "reward_std": 0.17606817185878754, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444522619247437, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1735 }, { "completion_length": 1632.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3813.0, "completions/max_terminated_length": 3813.0, "completions/mean_length": 1632.0833740234375, "completions/mean_terminated_length": 1632.0833740234375, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.5888738127544098, "frac_reward_zero_std": 0.5, "grad_norm": 0.1162729263305664, "kl": 0.0, "learning_rate": 2.092822636300897e-07, "loss": -0.0053, "num_tokens": 55216292.0, "reward": 0.7333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1736 }, { "completion_length": 693.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 909.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 693.5833740234375, "completions/mean_terminated_length": 693.5833740234375, "completions/min_length": 547.0, "completions/min_terminated_length": 547.0, "epoch": 0.58921302578019, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.0910973084886127e-07, "loss": 0.0, "num_tokens": 55240539.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1737 }, { "completion_length": 1008.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2013.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1008.0, "completions/mean_terminated_length": 1008.0, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.5895522388059702, "frac_reward_zero_std": 0.5, "grad_norm": 0.0722997859120369, "kl": 0.0, "learning_rate": 2.0893719806763284e-07, "loss": -0.0002, "num_tokens": 55266429.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1738 }, { "completion_length": 835.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1553.0, "completions/max_terminated_length": 1553.0, "completions/mean_length": 835.4166870117188, "completions/mean_terminated_length": 835.4166870117188, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.5898914518317503, "frac_reward_zero_std": 0.5, "grad_norm": 0.06362130492925644, "kl": 0.0, "learning_rate": 2.087646652864044e-07, "loss": -0.0004, "num_tokens": 55288550.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1739 }, { "completion_length": 1790.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3941.0, "completions/max_terminated_length": 3941.0, "completions/mean_length": 1790.25, "completions/mean_terminated_length": 1790.25, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 0.5902306648575305, "frac_reward_zero_std": 0.5, "grad_norm": 0.6119212508201599, "kl": 0.0, "learning_rate": 2.0859213250517595e-07, "loss": 0.0271, "num_tokens": 55323401.0, "reward": 0.9666666388511658, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.4923659861087799, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1740 }, { "completion_length": 533.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 877.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 533.6666870117188, "completions/mean_terminated_length": 533.6666870117188, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.5905698778833107, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.0841959972394753e-07, "loss": 0.0, "num_tokens": 55337815.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1741 }, { "completion_length": 1856.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3658.0, "completions/max_terminated_length": 3658.0, "completions/mean_length": 1856.25, "completions/mean_terminated_length": 1856.25, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 0.5909090909090909, "frac_reward_zero_std": 1.0, "grad_norm": 2.7585696216192446e-07, "kl": 0.0, "learning_rate": 2.082470669427191e-07, "loss": 0.0, "num_tokens": 55373758.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1742 }, { "completion_length": 558.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 558.25, "completions/mean_terminated_length": 558.25, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.591248303934871, "frac_reward_zero_std": 0.5, "grad_norm": 0.049856893718242645, "kl": 0.0, "learning_rate": 2.080745341614907e-07, "loss": 0.0002, "num_tokens": 55391857.0, "reward": 0.2875000238418579, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1743 }, { "completion_length": 983.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1414.0, "completions/max_terminated_length": 1414.0, "completions/mean_length": 983.5, "completions/mean_terminated_length": 983.5, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.5915875169606513, "frac_reward_zero_std": 0.5, "grad_norm": 0.08654404431581497, "kl": 0.0, "learning_rate": 2.0790200138026224e-07, "loss": -0.0011, "num_tokens": 55414555.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1744 }, { "completion_length": 883.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1383.0, "completions/max_terminated_length": 1383.0, "completions/mean_length": 883.4166870117188, "completions/mean_terminated_length": 883.4166870117188, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "epoch": 0.5919267299864315, "frac_reward_zero_std": 0.0, "grad_norm": 0.3005127012729645, "kl": 0.0, "learning_rate": 2.0772946859903382e-07, "loss": -0.0, "num_tokens": 55434636.0, "reward": 1.1000001430511475, "reward_std": 0.23782962560653687, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1745 }, { "completion_length": 813.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1748.0, "completions/max_terminated_length": 1748.0, "completions/mean_length": 813.8333740234375, "completions/mean_terminated_length": 813.8333740234375, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.5922659430122117, "frac_reward_zero_std": 0.5, "grad_norm": 0.06941237300634384, "kl": 0.0, "learning_rate": 2.0755693581780537e-07, "loss": 0.0005, "num_tokens": 55457194.0, "reward": 1.133333444595337, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1746 }, { "completion_length": 1788.8334350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 3672.0, "completions/max_terminated_length": 3672.0, "completions/mean_length": 1788.8333740234375, "completions/mean_terminated_length": 1788.8333740234375, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 0.5926051560379919, "frac_reward_zero_std": 1.0, "grad_norm": 1.0550185436386528e-07, "kl": 0.0, "learning_rate": 2.0738440303657695e-07, "loss": 0.0, "num_tokens": 55490768.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1747 }, { "completion_length": 2048.7500610351562, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5215.0, "completions/mean_length": 4245.08349609375, "completions/mean_terminated_length": 3073.125, "completions/min_length": 1673.0, "completions/min_terminated_length": 1673.0, "epoch": 0.592944369063772, "frac_reward_zero_std": 0.5, "grad_norm": 0.8333843946456909, "kl": NaN, "learning_rate": 2.072118702553485e-07, "loss": -0.0609, "num_tokens": 55525019.0, "reward": 0.6666667461395264, "reward_std": 0.22060523927211761, "rewards/correctness_reward_func/mean": 0.46666669845581055, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 1748 }, { "completion_length": 698.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1270.0, "completions/max_terminated_length": 1270.0, "completions/mean_length": 698.25, "completions/mean_terminated_length": 698.25, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 0.5932835820895522, "frac_reward_zero_std": 0.0, "grad_norm": 0.30767449736595154, "kl": 0.0, "learning_rate": 2.0703933747412008e-07, "loss": 0.0008, "num_tokens": 55551218.0, "reward": 1.1666667461395264, "reward_std": 0.2588964104652405, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.287096232175827, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1749 }, { "completion_length": 2199.0834350585938, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6380.0, "completions/mean_length": 2748.166748046875, "completions/mean_terminated_length": 2399.0, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.5936227951153324, "frac_reward_zero_std": 0.0, "grad_norm": 0.09802600741386414, "kl": NaN, "learning_rate": 2.0686680469289164e-07, "loss": -0.0076, "num_tokens": 55589295.0, "reward": 0.7541667819023132, "reward_std": 0.08225837349891663, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1750 }, { "completion_length": 998.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2544.0, "completions/max_terminated_length": 2544.0, "completions/mean_length": 998.75, "completions/mean_terminated_length": 998.75, "completions/min_length": 638.0, "completions/min_terminated_length": 638.0, "epoch": 0.5939620081411127, "frac_reward_zero_std": 0.0, "grad_norm": 0.09893840551376343, "kl": 0.0, "learning_rate": 2.0669427191166322e-07, "loss": -0.0006, "num_tokens": 55617006.0, "reward": 1.25, "reward_std": 0.09246455132961273, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1751 }, { "completion_length": 1055.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2101.0, "completions/max_terminated_length": 2101.0, "completions/mean_length": 1055.5833740234375, "completions/mean_terminated_length": 1055.5833740234375, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.5943012211668928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.0652173913043477e-07, "loss": 0.0, "num_tokens": 55636945.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1752 }, { "completion_length": 936.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1575.0, "completions/max_terminated_length": 1575.0, "completions/mean_length": 936.6666870117188, "completions/mean_terminated_length": 936.6666870117188, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 0.594640434192673, "frac_reward_zero_std": 0.0, "grad_norm": 0.11101523041725159, "kl": 0.0, "learning_rate": 2.0634920634920632e-07, "loss": 0.0008, "num_tokens": 55659945.0, "reward": 1.2333333492279053, "reward_std": 0.10327950119972229, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1753 }, { "completion_length": 803.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1239.0, "completions/max_terminated_length": 1239.0, "completions/mean_length": 803.0833740234375, "completions/mean_terminated_length": 803.0833740234375, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.5949796472184532, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.061766735679779e-07, "loss": 0.0, "num_tokens": 55680316.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1754 }, { "completion_length": 701.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1052.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 701.25, "completions/mean_terminated_length": 701.25, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.5953188602442334, "frac_reward_zero_std": 0.5, "grad_norm": 0.04718813672661781, "kl": 0.0, "learning_rate": 2.0600414078674945e-07, "loss": 0.0005, "num_tokens": 55699255.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1755 }, { "completion_length": 714.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1636.0, "completions/max_terminated_length": 1636.0, "completions/mean_length": 714.9166870117188, "completions/mean_terminated_length": 714.9166870117188, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 0.5956580732700135, "frac_reward_zero_std": 1.0, "grad_norm": 1.6128753088651138e-07, "kl": 0.0, "learning_rate": 2.0583160800552103e-07, "loss": 0.0, "num_tokens": 55721454.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1756 }, { "completion_length": 732.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1250.0, "completions/max_terminated_length": 1250.0, "completions/mean_length": 732.25, "completions/mean_terminated_length": 732.25, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.5959972862957937, "frac_reward_zero_std": 0.5, "grad_norm": 0.08978830277919769, "kl": 0.0, "learning_rate": 2.056590752242926e-07, "loss": -0.0013, "num_tokens": 55740849.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1757 }, { "completion_length": 831.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1940.0, "completions/max_terminated_length": 1940.0, "completions/mean_length": 831.8333740234375, "completions/mean_terminated_length": 831.8333740234375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.596336499321574, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.054865424430642e-07, "loss": 0.0, "num_tokens": 55760023.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1758 }, { "completion_length": 1710.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3242.0, "completions/max_terminated_length": 3242.0, "completions/mean_length": 1710.5833740234375, "completions/mean_terminated_length": 1710.5833740234375, "completions/min_length": 671.0, "completions/min_terminated_length": 671.0, "epoch": 0.5966757123473542, "frac_reward_zero_std": 0.5, "grad_norm": 0.09654907882213593, "kl": 0.0, "learning_rate": 2.0531400966183575e-07, "loss": -0.001, "num_tokens": 55793648.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1759 }, { "completion_length": 2501.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5877.0, "completions/max_terminated_length": 5877.0, "completions/mean_length": 2501.916748046875, "completions/mean_terminated_length": 2501.916748046875, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 0.5970149253731343, "frac_reward_zero_std": 0.5, "grad_norm": 0.08622410148382187, "kl": 0.0, "learning_rate": 2.0514147688060732e-07, "loss": -0.0016, "num_tokens": 55840399.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1760 }, { "completion_length": 1199.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2081.0, "completions/max_terminated_length": 2081.0, "completions/mean_length": 1199.8333740234375, "completions/mean_terminated_length": 1199.8333740234375, "completions/min_length": 593.0, "completions/min_terminated_length": 593.0, "epoch": 0.5973541383989145, "frac_reward_zero_std": 0.5, "grad_norm": 0.1458306759595871, "kl": 0.0, "learning_rate": 2.0496894409937888e-07, "loss": -0.0003, "num_tokens": 55867007.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1761 }, { "completion_length": 1583.5, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4787.0, "completions/mean_length": 2132.58349609375, "completions/mean_terminated_length": 1727.45458984375, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.5976933514246947, "frac_reward_zero_std": 0.5, "grad_norm": 0.03182739019393921, "kl": NaN, "learning_rate": 2.0479641131815046e-07, "loss": -0.0052, "num_tokens": 55896701.0, "reward": 0.2875000238418579, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1762 }, { "completion_length": 956.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1698.0, "completions/max_terminated_length": 1698.0, "completions/mean_length": 956.4166870117188, "completions/mean_terminated_length": 956.4166870117188, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 0.5980325644504749, "frac_reward_zero_std": 1.0, "grad_norm": 1.2552914085972589e-07, "kl": 0.0, "learning_rate": 2.04623878536922e-07, "loss": 0.0, "num_tokens": 55922032.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1763 }, { "completion_length": 910.4166870117188, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 3023.0, "completions/mean_length": 2008.5833740234375, "completions/mean_terminated_length": 1092.5, "completions/min_length": 635.0, "completions/min_terminated_length": 635.0, "epoch": 0.5983717774762551, "frac_reward_zero_std": 0.5, "grad_norm": 0.47984209656715393, "kl": NaN, "learning_rate": 2.0445134575569356e-07, "loss": -0.0284, "num_tokens": 55947585.0, "reward": 0.8166667222976685, "reward_std": 0.20165978372097015, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 1764 }, { "completion_length": 1338.4166870117188, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 6382.0, "completions/mean_length": 2985.666748046875, "completions/mean_terminated_length": 1784.5555419921875, "completions/min_length": 593.0, "completions/min_terminated_length": 593.0, "epoch": 0.5987109905020352, "frac_reward_zero_std": 0.0, "grad_norm": 0.7331798076629639, "kl": NaN, "learning_rate": 2.0427881297446514e-07, "loss": -0.0342, "num_tokens": 55974344.0, "reward": 0.7916667461395264, "reward_std": 0.32259485125541687, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.42497774958610535, "rewards/format_reward_func/mean": 0.2250000238418579, "rewards/format_reward_func/std": 0.13568010926246643, "step": 1765 }, { "completion_length": 2746.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6507.0, "completions/max_terminated_length": 6507.0, "completions/mean_length": 2746.5, "completions/mean_terminated_length": 2746.5, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.5990502035278155, "frac_reward_zero_std": 0.5, "grad_norm": 0.6906976103782654, "kl": 0.0, "learning_rate": 2.041062801932367e-07, "loss": 0.0161, "num_tokens": 56018522.0, "reward": 0.4333333671092987, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.13333334028720856, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1766 }, { "completion_length": 586.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 586.4166870117188, "completions/mean_terminated_length": 586.4166870117188, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.5993894165535957, "frac_reward_zero_std": 1.0, "grad_norm": 1.1772424102218793e-07, "kl": 0.0, "learning_rate": 2.0393374741200827e-07, "loss": 0.0, "num_tokens": 56036965.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1767 }, { "completion_length": 1054.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3260.0, "completions/max_terminated_length": 3260.0, "completions/mean_length": 1054.0833740234375, "completions/mean_terminated_length": 1054.0833740234375, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.5997286295793759, "frac_reward_zero_std": 0.5, "grad_norm": 0.08895274996757507, "kl": 0.0, "learning_rate": 2.0376121463077983e-07, "loss": 0.0011, "num_tokens": 56062214.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1768 }, { "completion_length": 530.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 530.0833740234375, "completions/mean_terminated_length": 530.0833740234375, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.600067842605156, "frac_reward_zero_std": 0.5, "grad_norm": 0.03645140305161476, "kl": 0.0, "learning_rate": 2.035886818495514e-07, "loss": -0.0003, "num_tokens": 56078859.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1769 }, { "completion_length": 1155.1667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 3364.0, "completions/max_terminated_length": 3364.0, "completions/mean_length": 1155.166748046875, "completions/mean_terminated_length": 1155.166748046875, "completions/min_length": 609.0, "completions/min_terminated_length": 609.0, "epoch": 0.6004070556309362, "frac_reward_zero_std": 0.5, "grad_norm": 0.6097166538238525, "kl": 0.0, "learning_rate": 2.0341614906832296e-07, "loss": 0.0214, "num_tokens": 56109149.0, "reward": 1.0500000715255737, "reward_std": 0.17606817185878754, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.24308621883392334, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1770 }, { "completion_length": 1121.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 6306.0, "completions/max_terminated_length": 6306.0, "completions/mean_length": 1121.3333740234375, "completions/mean_terminated_length": 1121.3333740234375, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.6007462686567164, "frac_reward_zero_std": 0.5, "grad_norm": 0.36625170707702637, "kl": 0.0, "learning_rate": 2.0324361628709454e-07, "loss": -0.0105, "num_tokens": 56137323.0, "reward": 1.2166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1771 }, { "completion_length": 1330.9167175292969, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5761.0, "completions/mean_length": 2429.08349609375, "completions/mean_terminated_length": 1597.0999755859375, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "epoch": 0.6010854816824966, "frac_reward_zero_std": 0.5, "grad_norm": 0.3001353144645691, "kl": NaN, "learning_rate": 2.030710835058661e-07, "loss": -0.0083, "num_tokens": 56169134.0, "reward": 0.8166667222976685, "reward_std": 0.20165979862213135, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 1772 }, { "completion_length": 1499.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3154.0, "completions/max_terminated_length": 3154.0, "completions/mean_length": 1499.3333740234375, "completions/mean_terminated_length": 1499.3333740234375, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 0.6014246947082768, "frac_reward_zero_std": 1.0, "grad_norm": 9.075550622128503e-08, "kl": 0.0, "learning_rate": 2.028985507246377e-07, "loss": 0.0, "num_tokens": 56196300.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1773 }, { "completion_length": 1734.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6089.0, "completions/max_terminated_length": 6089.0, "completions/mean_length": 1734.5, "completions/mean_terminated_length": 1734.5, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 0.601763907734057, "frac_reward_zero_std": 0.5, "grad_norm": 0.6828683614730835, "kl": 0.0, "learning_rate": 2.0272601794340925e-07, "loss": 0.0257, "num_tokens": 56225946.0, "reward": 0.5, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.20000000298023224, "rewards/correctness_reward_func/std": 0.36181363463401794, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1774 }, { "completion_length": 1548.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3627.0, "completions/max_terminated_length": 3627.0, "completions/mean_length": 1548.0, "completions/mean_terminated_length": 1548.0, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.6021031207598372, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.025534851621808e-07, "loss": 0.0, "num_tokens": 56256444.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1775 }, { "completion_length": 1478.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2252.0, "completions/max_terminated_length": 2252.0, "completions/mean_length": 1478.666748046875, "completions/mean_terminated_length": 1478.666748046875, "completions/min_length": 819.0, "completions/min_terminated_length": 819.0, "epoch": 0.6024423337856174, "frac_reward_zero_std": 0.5, "grad_norm": 0.6188534498214722, "kl": 0.0, "learning_rate": 2.0238095238095238e-07, "loss": 0.0037, "num_tokens": 56285564.0, "reward": 1.0499999523162842, "reward_std": 0.27386125922203064, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1776 }, { "completion_length": 1674.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4907.0, "completions/max_terminated_length": 4907.0, "completions/mean_length": 1674.5, "completions/mean_terminated_length": 1674.5, "completions/min_length": 691.0, "completions/min_terminated_length": 691.0, "epoch": 0.6027815468113975, "frac_reward_zero_std": 1.0, "grad_norm": 8.571488763209345e-08, "kl": 0.0, "learning_rate": 2.0220841959972393e-07, "loss": 0.0, "num_tokens": 56316146.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1777 }, { "completion_length": 1295.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2780.0, "completions/max_terminated_length": 2780.0, "completions/mean_length": 1295.5, "completions/mean_terminated_length": 1295.5, "completions/min_length": 625.0, "completions/min_terminated_length": 625.0, "epoch": 0.6031207598371777, "frac_reward_zero_std": 0.5, "grad_norm": 0.09135447442531586, "kl": 0.0, "learning_rate": 2.0203588681849551e-07, "loss": 0.0003, "num_tokens": 56340236.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1778 }, { "completion_length": 1291.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1962.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 1291.3333740234375, "completions/mean_terminated_length": 1291.3333740234375, "completions/min_length": 693.0, "completions/min_terminated_length": 693.0, "epoch": 0.6034599728629579, "frac_reward_zero_std": 0.5, "grad_norm": 0.080439992249012, "kl": 0.0, "learning_rate": 2.0186335403726707e-07, "loss": -0.0003, "num_tokens": 56363622.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1779 }, { "completion_length": 1583.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3874.0, "completions/max_terminated_length": 3874.0, "completions/mean_length": 1583.0833740234375, "completions/mean_terminated_length": 1583.0833740234375, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.6037991858887382, "frac_reward_zero_std": 0.5, "grad_norm": 0.07302762567996979, "kl": 0.0, "learning_rate": 2.0169082125603865e-07, "loss": -0.0004, "num_tokens": 56394919.0, "reward": 0.7333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1780 }, { "completion_length": 637.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1218.0, "completions/max_terminated_length": 1218.0, "completions/mean_length": 637.75, "completions/mean_terminated_length": 637.75, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.6041383989145184, "frac_reward_zero_std": 1.0, "grad_norm": 9.144230972424339e-08, "kl": 0.0, "learning_rate": 2.015182884748102e-07, "loss": 0.0, "num_tokens": 56412820.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1781 }, { "completion_length": 1081.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1836.0, "completions/max_terminated_length": 1836.0, "completions/mean_length": 1081.25, "completions/mean_terminated_length": 1081.25, "completions/min_length": 565.0, "completions/min_terminated_length": 565.0, "epoch": 0.6044776119402985, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.0134575569358178e-07, "loss": 0.0, "num_tokens": 56440795.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1782 }, { "completion_length": 958.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1814.0, "completions/max_terminated_length": 1814.0, "completions/mean_length": 958.1666870117188, "completions/mean_terminated_length": 958.1666870117188, "completions/min_length": 639.0, "completions/min_terminated_length": 639.0, "epoch": 0.6048168249660787, "frac_reward_zero_std": 0.5, "grad_norm": 0.06365250051021576, "kl": 0.0, "learning_rate": 2.0117322291235333e-07, "loss": 0.0002, "num_tokens": 56467275.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1783 }, { "completion_length": 1464.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2444.0, "completions/max_terminated_length": 2444.0, "completions/mean_length": 1464.0833740234375, "completions/mean_terminated_length": 1464.0833740234375, "completions/min_length": 710.0, "completions/min_terminated_length": 710.0, "epoch": 0.6051560379918589, "frac_reward_zero_std": 0.0, "grad_norm": 0.37266993522644043, "kl": 0.0, "learning_rate": 2.010006901311249e-07, "loss": -0.004, "num_tokens": 56495170.0, "reward": 1.0833333730697632, "reward_std": 0.21807155013084412, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1784 }, { "completion_length": 889.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1236.0, "completions/max_terminated_length": 1236.0, "completions/mean_length": 889.9166870117188, "completions/mean_terminated_length": 889.9166870117188, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 0.6054952510176391, "frac_reward_zero_std": 1.0, "grad_norm": 8.1980068955545e-08, "kl": 0.0, "learning_rate": 2.0082815734989646e-07, "loss": 0.0, "num_tokens": 56515935.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1785 }, { "completion_length": 1003.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2111.0, "completions/max_terminated_length": 2111.0, "completions/mean_length": 1003.3333740234375, "completions/mean_terminated_length": 1003.3333740234375, "completions/min_length": 656.0, "completions/min_terminated_length": 656.0, "epoch": 0.6058344640434192, "frac_reward_zero_std": 0.5, "grad_norm": 0.39528313279151917, "kl": 0.0, "learning_rate": 2.0065562456866802e-07, "loss": 0.0023, "num_tokens": 56542915.0, "reward": 0.8666666746139526, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1786 }, { "completion_length": 669.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 669.25, "completions/mean_terminated_length": 669.25, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 0.6061736770691994, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.004830917874396e-07, "loss": 0.0, "num_tokens": 56564284.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1787 }, { "completion_length": 1189.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 1830.0, "completions/max_terminated_length": 1830.0, "completions/mean_length": 1189.166748046875, "completions/mean_terminated_length": 1189.166748046875, "completions/min_length": 608.0, "completions/min_terminated_length": 608.0, "epoch": 0.6065128900949797, "frac_reward_zero_std": 0.5, "grad_norm": 0.30892810225486755, "kl": 0.0, "learning_rate": 2.0031055900621115e-07, "loss": -0.0047, "num_tokens": 56595624.0, "reward": 1.1500000953674316, "reward_std": 0.17606817185878754, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444522619247437, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1788 }, { "completion_length": 745.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1408.0, "completions/max_terminated_length": 1408.0, "completions/mean_length": 745.1666870117188, "completions/mean_terminated_length": 745.1666870117188, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.6068521031207599, "frac_reward_zero_std": 0.5, "grad_norm": 0.055821310728788376, "kl": 0.0, "learning_rate": 2.0013802622498275e-07, "loss": 0.0, "num_tokens": 56621180.0, "reward": 0.7666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1789 }, { "completion_length": 661.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 944.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 661.75, "completions/mean_terminated_length": 661.75, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.60719131614654, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.999654934437543e-07, "loss": 0.0, "num_tokens": 56641337.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1790 }, { "completion_length": 1899.5833740234375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5579.0, "completions/mean_length": 2997.75, "completions/mean_terminated_length": 2279.5, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.6075305291723202, "frac_reward_zero_std": 0.5, "grad_norm": 1.5340323448181152, "kl": NaN, "learning_rate": 1.9979296066252589e-07, "loss": -0.1109, "num_tokens": 56674512.0, "reward": 1.0833332538604736, "reward_std": 0.3356585204601288, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 1791 }, { "completion_length": 769.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1716.0, "completions/max_terminated_length": 1716.0, "completions/mean_length": 769.5, "completions/mean_terminated_length": 769.5, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.6078697421981004, "frac_reward_zero_std": 0.5, "grad_norm": 0.07955897599458694, "kl": 0.0, "learning_rate": 1.9962042788129744e-07, "loss": 0.0019, "num_tokens": 56696790.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1792 }, { "completion_length": 582.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 582.5833740234375, "completions/mean_terminated_length": 582.5833740234375, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.6082089552238806, "frac_reward_zero_std": 0.5, "grad_norm": 0.054953865706920624, "kl": 0.0, "learning_rate": 1.9944789510006902e-07, "loss": 0.0001, "num_tokens": 56714731.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1793 }, { "completion_length": 662.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1275.0, "completions/max_terminated_length": 1275.0, "completions/mean_length": 662.3333740234375, "completions/mean_terminated_length": 662.3333740234375, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 0.6085481682496607, "frac_reward_zero_std": 0.5, "grad_norm": 0.20525091886520386, "kl": 0.0, "learning_rate": 1.9927536231884057e-07, "loss": 0.0009, "num_tokens": 56736347.0, "reward": 1.133333444595337, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1794 }, { "completion_length": 912.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1355.0, "completions/max_terminated_length": 1355.0, "completions/mean_length": 912.5, "completions/mean_terminated_length": 912.5, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 0.608887381275441, "frac_reward_zero_std": 1.0, "grad_norm": 1.838018732769342e-07, "kl": 0.0, "learning_rate": 1.9910282953761215e-07, "loss": 0.0, "num_tokens": 56758601.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1795 }, { "completion_length": 1286.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2660.0, "completions/max_terminated_length": 2660.0, "completions/mean_length": 1286.75, "completions/mean_terminated_length": 1286.75, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "epoch": 0.6092265943012212, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.989302967563837e-07, "loss": 0.0, "num_tokens": 56784776.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1796 }, { "completion_length": 1054.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1764.0, "completions/max_terminated_length": 1764.0, "completions/mean_length": 1054.75, "completions/mean_terminated_length": 1054.75, "completions/min_length": 604.0, "completions/min_terminated_length": 604.0, "epoch": 0.6095658073270014, "frac_reward_zero_std": 0.0, "grad_norm": 1.015573501586914, "kl": 0.0, "learning_rate": 1.9875776397515526e-07, "loss": -0.0024, "num_tokens": 56808257.0, "reward": 0.9166667461395264, "reward_std": 0.2599138617515564, "rewards/correctness_reward_func/mean": 0.6166666746139526, "rewards/correctness_reward_func/std": 0.37618499994277954, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1797 }, { "completion_length": 1389.9167175292969, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 3879.0, "completions/mean_length": 3037.166748046875, "completions/mean_terminated_length": 1853.2222900390625, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 0.6099050203527816, "frac_reward_zero_std": 0.0, "grad_norm": 1.0421550273895264, "kl": NaN, "learning_rate": 1.9858523119392684e-07, "loss": -0.0896, "num_tokens": 56836840.0, "reward": 0.7791666984558105, "reward_std": 0.5275882482528687, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.42497774958610535, "rewards/format_reward_func/mean": 0.21250002086162567, "rewards/format_reward_func/std": 0.13505050539970398, "step": 1798 }, { "completion_length": 1128.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2204.0, "completions/max_terminated_length": 2204.0, "completions/mean_length": 1128.25, "completions/mean_terminated_length": 1128.25, "completions/min_length": 552.0, "completions/min_terminated_length": 552.0, "epoch": 0.6102442333785617, "frac_reward_zero_std": 1.0, "grad_norm": 9.843075332582885e-08, "kl": 0.0, "learning_rate": 1.984126984126984e-07, "loss": 0.0, "num_tokens": 56862055.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1799 }, { "completion_length": 735.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1317.0, "completions/max_terminated_length": 1317.0, "completions/mean_length": 735.3333740234375, "completions/mean_terminated_length": 735.3333740234375, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.6105834464043419, "frac_reward_zero_std": 1.0, "grad_norm": 1.7668583041086094e-07, "kl": 0.0, "learning_rate": 1.9824016563146997e-07, "loss": 0.0, "num_tokens": 56880599.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1800 }, { "completion_length": 1856.7501220703125, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4155.0, "completions/mean_length": 2954.916748046875, "completions/mean_terminated_length": 2228.10009765625, "completions/min_length": 1011.0, "completions/min_terminated_length": 1011.0, "epoch": 0.6109226594301221, "frac_reward_zero_std": 0.0, "grad_norm": 0.17424151301383972, "kl": NaN, "learning_rate": 1.9806763285024152e-07, "loss": -0.017, "num_tokens": 56912888.0, "reward": 0.23750004172325134, "reward_std": 0.10807829350233078, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.23749999701976776, "rewards/format_reward_func/std": 0.11894422769546509, "step": 1801 }, { "completion_length": 1079.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2122.0, "completions/max_terminated_length": 2122.0, "completions/mean_length": 1079.0833740234375, "completions/mean_terminated_length": 1079.0833740234375, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.6112618724559024, "frac_reward_zero_std": 0.0, "grad_norm": 0.3305732011795044, "kl": 0.0, "learning_rate": 1.978951000690131e-07, "loss": -0.004, "num_tokens": 56936103.0, "reward": 1.120833396911621, "reward_std": 0.19391795992851257, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1802 }, { "completion_length": 991.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1580.0, "completions/max_terminated_length": 1580.0, "completions/mean_length": 991.8333740234375, "completions/mean_terminated_length": 991.8333740234375, "completions/min_length": 557.0, "completions/min_terminated_length": 557.0, "epoch": 0.6116010854816825, "frac_reward_zero_std": 0.5, "grad_norm": 0.07604963332414627, "kl": 0.0, "learning_rate": 1.9772256728778465e-07, "loss": 0.0002, "num_tokens": 56956057.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1803 }, { "completion_length": 1059.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4142.0, "completions/max_terminated_length": 4142.0, "completions/mean_length": 1059.166748046875, "completions/mean_terminated_length": 1059.166748046875, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 0.6119402985074627, "frac_reward_zero_std": 1.0, "grad_norm": 2.2112412523256353e-07, "kl": 0.0, "learning_rate": 1.9755003450655626e-07, "loss": 0.0, "num_tokens": 56983761.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1804 }, { "completion_length": 1374.5833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3188.0, "completions/mean_length": 1923.666748046875, "completions/mean_terminated_length": 1499.5455322265625, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "epoch": 0.6122795115332429, "frac_reward_zero_std": 0.5, "grad_norm": 0.02621053159236908, "kl": NaN, "learning_rate": 1.973775017253278e-07, "loss": -0.0037, "num_tokens": 57011674.0, "reward": 0.6875001788139343, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1805 }, { "completion_length": 2718.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 6049.0, "completions/max_terminated_length": 6049.0, "completions/mean_length": 2718.58349609375, "completions/mean_terminated_length": 2718.58349609375, "completions/min_length": 641.0, "completions/min_terminated_length": 641.0, "epoch": 0.6126187245590231, "frac_reward_zero_std": 0.0, "grad_norm": 0.1833844780921936, "kl": 0.0, "learning_rate": 1.972049689440994e-07, "loss": 0.0049, "num_tokens": 57056705.0, "reward": 1.2333333492279053, "reward_std": 0.0955970510840416, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1806 }, { "completion_length": 1073.1667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 1858.0, "completions/max_terminated_length": 1858.0, "completions/mean_length": 1073.166748046875, "completions/mean_terminated_length": 1073.166748046875, "completions/min_length": 717.0, "completions/min_terminated_length": 717.0, "epoch": 0.6129579375848032, "frac_reward_zero_std": 1.0, "grad_norm": 1.5407492526264832e-07, "kl": 0.0, "learning_rate": 1.9703243616287094e-07, "loss": 0.0, "num_tokens": 57079477.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1807 }, { "completion_length": 1102.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1686.0, "completions/max_terminated_length": 1686.0, "completions/mean_length": 1102.25, "completions/mean_terminated_length": 1102.25, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "epoch": 0.6132971506105834, "frac_reward_zero_std": 0.0, "grad_norm": 0.10090050846338272, "kl": 0.0, "learning_rate": 1.9685990338164252e-07, "loss": -0.0016, "num_tokens": 57107560.0, "reward": 1.183333396911621, "reward_std": 0.09246458858251572, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1808 }, { "completion_length": 1587.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3671.0, "completions/max_terminated_length": 3671.0, "completions/mean_length": 1587.25, "completions/mean_terminated_length": 1587.25, "completions/min_length": 631.0, "completions/min_terminated_length": 631.0, "epoch": 0.6136363636363636, "frac_reward_zero_std": 0.0, "grad_norm": 0.5370144248008728, "kl": 0.0, "learning_rate": 1.9668737060041408e-07, "loss": 0.0151, "num_tokens": 57139255.0, "reward": 1.1166667938232422, "reward_std": 0.24096208810806274, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1809 }, { "completion_length": 1752.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5945.0, "completions/max_terminated_length": 5945.0, "completions/mean_length": 1752.3333740234375, "completions/mean_terminated_length": 1752.3333740234375, "completions/min_length": 542.0, "completions/min_terminated_length": 542.0, "epoch": 0.6139755766621439, "frac_reward_zero_std": 0.5, "grad_norm": 0.0604725144803524, "kl": 0.0, "learning_rate": 1.9651483781918563e-07, "loss": 0.0001, "num_tokens": 57169751.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1810 }, { "completion_length": 1107.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2317.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 1107.75, "completions/mean_terminated_length": 1107.75, "completions/min_length": 491.0, "completions/min_terminated_length": 491.0, "epoch": 0.614314789687924, "frac_reward_zero_std": 0.5, "grad_norm": 0.10557481646537781, "kl": 0.0, "learning_rate": 1.963423050379572e-07, "loss": 0.001, "num_tokens": 57194900.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1811 }, { "completion_length": 1020.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2458.0, "completions/max_terminated_length": 2458.0, "completions/mean_length": 1020.75, "completions/mean_terminated_length": 1020.75, "completions/min_length": 569.0, "completions/min_terminated_length": 569.0, "epoch": 0.6146540027137042, "frac_reward_zero_std": 0.5, "grad_norm": 0.07685381174087524, "kl": 0.0, "learning_rate": 1.9616977225672876e-07, "loss": -0.0009, "num_tokens": 57218111.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1812 }, { "completion_length": 809.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1324.0, "completions/max_terminated_length": 1324.0, "completions/mean_length": 809.25, "completions/mean_terminated_length": 809.25, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 0.6149932157394844, "frac_reward_zero_std": 0.5, "grad_norm": 0.040425386279821396, "kl": 0.0, "learning_rate": 1.9599723947550034e-07, "loss": -0.0003, "num_tokens": 57242108.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1813 }, { "completion_length": 1037.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1298.0, "completions/max_terminated_length": 1298.0, "completions/mean_length": 1037.0833740234375, "completions/mean_terminated_length": 1037.0833740234375, "completions/min_length": 698.0, "completions/min_terminated_length": 698.0, "epoch": 0.6153324287652646, "frac_reward_zero_std": 1.0, "grad_norm": 1.7933778906353837e-07, "kl": 0.0, "learning_rate": 1.958247066942719e-07, "loss": 0.0, "num_tokens": 57266643.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1814 }, { "completion_length": 675.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1057.0, "completions/max_terminated_length": 1057.0, "completions/mean_length": 675.5833740234375, "completions/mean_terminated_length": 675.5833740234375, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 0.6156716417910447, "frac_reward_zero_std": 1.0, "grad_norm": 8.153984509817747e-08, "kl": 0.0, "learning_rate": 1.9565217391304347e-07, "loss": 0.0, "num_tokens": 57286084.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1815 }, { "completion_length": 1443.0833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3696.0, "completions/mean_length": 1992.166748046875, "completions/mean_terminated_length": 1574.2728271484375, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "epoch": 0.6160108548168249, "frac_reward_zero_std": 0.5, "grad_norm": 0.20801404118537903, "kl": NaN, "learning_rate": 1.9547964113181502e-07, "loss": -0.0335, "num_tokens": 57316979.0, "reward": 0.6916667819023132, "reward_std": 0.26536139845848083, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 1816 }, { "completion_length": 747.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1624.0, "completions/max_terminated_length": 1624.0, "completions/mean_length": 747.3333740234375, "completions/mean_terminated_length": 747.3333740234375, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.6163500678426052, "frac_reward_zero_std": 1.0, "grad_norm": 1.2694749784714077e-07, "kl": 0.0, "learning_rate": 1.953071083505866e-07, "loss": 0.0, "num_tokens": 57340257.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1817 }, { "completion_length": 1395.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 3286.0, "completions/max_terminated_length": 3286.0, "completions/mean_length": 1395.5833740234375, "completions/mean_terminated_length": 1395.5833740234375, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.6166892808683854, "frac_reward_zero_std": 1.0, "grad_norm": 1.8553282643551938e-07, "kl": 0.0, "learning_rate": 1.9513457556935816e-07, "loss": 0.0, "num_tokens": 57370492.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1818 }, { "completion_length": 857.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1287.0, "completions/max_terminated_length": 1287.0, "completions/mean_length": 857.1666870117188, "completions/mean_terminated_length": 857.1666870117188, "completions/min_length": 549.0, "completions/min_terminated_length": 549.0, "epoch": 0.6170284938941656, "frac_reward_zero_std": 0.5, "grad_norm": 0.04477780684828758, "kl": 0.0, "learning_rate": 1.9496204278812974e-07, "loss": -0.0007, "num_tokens": 57396990.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1819 }, { "completion_length": 1627.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5433.0, "completions/max_terminated_length": 5433.0, "completions/mean_length": 1627.75, "completions/mean_terminated_length": 1627.75, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 0.6173677069199457, "frac_reward_zero_std": 0.5, "grad_norm": 0.11882726103067398, "kl": 0.0, "learning_rate": 1.9478951000690132e-07, "loss": -0.0081, "num_tokens": 57425709.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1820 }, { "completion_length": 2702.3333740234375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6490.0, "completions/mean_length": 3800.5, "completions/mean_terminated_length": 3242.800048828125, "completions/min_length": 745.0, "completions/min_terminated_length": 745.0, "epoch": 0.6177069199457259, "frac_reward_zero_std": 0.5, "grad_norm": 0.698258638381958, "kl": NaN, "learning_rate": 1.9461697722567287e-07, "loss": -0.0634, "num_tokens": 57466237.0, "reward": 0.8333333730697632, "reward_std": 0.24013882875442505, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.5149286389350891, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 1821 }, { "completion_length": 1623.6666870117188, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5006.0, "completions/mean_length": 2172.75, "completions/mean_terminated_length": 1771.2728271484375, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.6180461329715061, "frac_reward_zero_std": 0.5, "grad_norm": 0.571277379989624, "kl": NaN, "learning_rate": 1.9444444444444445e-07, "loss": -0.0082, "num_tokens": 57494169.0, "reward": 0.7583334445953369, "reward_std": 0.22453653812408447, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1822 }, { "completion_length": 1864.75, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5239.0, "completions/mean_length": 2962.916748046875, "completions/mean_terminated_length": 2237.699951171875, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 0.6183853459972863, "frac_reward_zero_std": 0.5, "grad_norm": 0.8144029378890991, "kl": NaN, "learning_rate": 1.94271911663216e-07, "loss": -0.0784, "num_tokens": 57527214.0, "reward": 0.9166667461395264, "reward_std": 0.2840188145637512, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 1823 }, { "completion_length": 519.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 519.6666870117188, "completions/mean_terminated_length": 519.6666870117188, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.6187245590230664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.9409937888198758e-07, "loss": 0.0, "num_tokens": 57546734.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1824 }, { "completion_length": 958.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1294.0, "completions/max_terminated_length": 1294.0, "completions/mean_length": 958.4166870117188, "completions/mean_terminated_length": 958.4166870117188, "completions/min_length": 609.0, "completions/min_terminated_length": 609.0, "epoch": 0.6190637720488467, "frac_reward_zero_std": 0.5, "grad_norm": 0.11543634533882141, "kl": 0.0, "learning_rate": 1.9392684610075913e-07, "loss": -0.0009, "num_tokens": 57572029.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1825 }, { "completion_length": 792.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1870.0, "completions/max_terminated_length": 1870.0, "completions/mean_length": 792.25, "completions/mean_terminated_length": 792.25, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.6194029850746269, "frac_reward_zero_std": 1.0, "grad_norm": 1.2875661070665956e-07, "kl": 0.0, "learning_rate": 1.937543133195307e-07, "loss": 0.0, "num_tokens": 57593836.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1826 }, { "completion_length": 1299.9167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 3446.0, "completions/max_terminated_length": 3446.0, "completions/mean_length": 1299.916748046875, "completions/mean_terminated_length": 1299.916748046875, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 0.6197421981004071, "frac_reward_zero_std": 0.5, "grad_norm": 0.591293454170227, "kl": 0.0, "learning_rate": 1.9358178053830226e-07, "loss": -0.0019, "num_tokens": 57621117.0, "reward": 0.5499999523162842, "reward_std": 0.27386125922203064, "rewards/correctness_reward_func/mean": 0.25, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1827 }, { "completion_length": 3186.58349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 4152.0, "completions/max_terminated_length": 4152.0, "completions/mean_length": 3186.58349609375, "completions/mean_terminated_length": 3186.58349609375, "completions/min_length": 2219.0, "completions/min_terminated_length": 2219.0, "epoch": 0.6200814111261872, "frac_reward_zero_std": 0.5, "grad_norm": 0.5670487880706787, "kl": 0.0, "learning_rate": 1.9340924775707384e-07, "loss": -0.0008, "num_tokens": 57669904.0, "reward": 0.5666667222976685, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.2666666805744171, "rewards/correctness_reward_func/std": 0.393892765045166, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1828 }, { "completion_length": 904.9166870117188, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 1347.0, "completions/mean_length": 1454.0, "completions/mean_terminated_length": 987.1818237304688, "completions/min_length": 728.0, "completions/min_terminated_length": 728.0, "epoch": 0.6204206241519674, "frac_reward_zero_std": 0.5, "grad_norm": 0.1923031359910965, "kl": NaN, "learning_rate": 1.932367149758454e-07, "loss": -0.0106, "num_tokens": 57692055.0, "reward": 1.1083333492279053, "reward_std": 0.22453658282756805, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.28069180250167847, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 1829 }, { "completion_length": 825.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1823.0, "completions/max_terminated_length": 1823.0, "completions/mean_length": 825.5833740234375, "completions/mean_terminated_length": 825.5833740234375, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.6207598371777476, "frac_reward_zero_std": 1.0, "grad_norm": 1.262523170453278e-07, "kl": 0.0, "learning_rate": 1.9306418219461698e-07, "loss": 0.0, "num_tokens": 57715978.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1830 }, { "completion_length": 926.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1267.0, "completions/max_terminated_length": 1267.0, "completions/mean_length": 926.0, "completions/mean_terminated_length": 926.0, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "epoch": 0.6210990502035278, "frac_reward_zero_std": 0.0, "grad_norm": 0.11727043241262436, "kl": 0.0, "learning_rate": 1.9289164941338853e-07, "loss": 0.0002, "num_tokens": 57737398.0, "reward": 1.1666667461395264, "reward_std": 0.09559705853462219, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1831 }, { "completion_length": 2206.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6413.0, "completions/max_terminated_length": 6413.0, "completions/mean_length": 2206.5, "completions/mean_terminated_length": 2206.5, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "epoch": 0.621438263229308, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.9271911663216008e-07, "loss": 0.0, "num_tokens": 57777778.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1832 }, { "completion_length": 1996.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4326.0, "completions/max_terminated_length": 4326.0, "completions/mean_length": 1996.0833740234375, "completions/mean_terminated_length": 1996.0833740234375, "completions/min_length": 886.0, "completions/min_terminated_length": 886.0, "epoch": 0.6217774762550882, "frac_reward_zero_std": 0.0, "grad_norm": 0.1484280228614807, "kl": 0.0, "learning_rate": 1.9254658385093166e-07, "loss": 0.001, "num_tokens": 57812351.0, "reward": 1.183333396911621, "reward_std": 0.09246458858251572, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1833 }, { "completion_length": 1092.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2785.0, "completions/max_terminated_length": 2785.0, "completions/mean_length": 1092.25, "completions/mean_terminated_length": 1092.25, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.6221166892808684, "frac_reward_zero_std": 0.5, "grad_norm": 0.364654541015625, "kl": 0.0, "learning_rate": 1.9237405106970321e-07, "loss": -0.0024, "num_tokens": 57836558.0, "reward": 1.066666603088379, "reward_std": 0.20655910670757294, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941801071167, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1834 }, { "completion_length": 698.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1430.0, "completions/max_terminated_length": 1430.0, "completions/mean_length": 698.6666870117188, "completions/mean_terminated_length": 698.6666870117188, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.6224559023066486, "frac_reward_zero_std": 0.5, "grad_norm": 0.3588411211967468, "kl": 0.0, "learning_rate": 1.9220151828847482e-07, "loss": -0.001, "num_tokens": 57855160.0, "reward": 1.149999976158142, "reward_std": 0.17606817185878754, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444525599479675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1835 }, { "completion_length": 1179.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2439.0, "completions/max_terminated_length": 2439.0, "completions/mean_length": 1179.5833740234375, "completions/mean_terminated_length": 1179.5833740234375, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.6227951153324288, "frac_reward_zero_std": 0.5, "grad_norm": 0.06520876288414001, "kl": 0.0, "learning_rate": 1.9202898550724637e-07, "loss": 0.0002, "num_tokens": 57880319.0, "reward": 0.7333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1836 }, { "completion_length": 1042.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2474.0, "completions/max_terminated_length": 2474.0, "completions/mean_length": 1042.75, "completions/mean_terminated_length": 1042.75, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.6231343283582089, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.9185645272601795e-07, "loss": 0.0, "num_tokens": 57907226.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1837 }, { "completion_length": 1205.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2519.0, "completions/max_terminated_length": 2519.0, "completions/mean_length": 1205.3333740234375, "completions/mean_terminated_length": 1205.3333740234375, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.6234735413839891, "frac_reward_zero_std": 0.5, "grad_norm": 0.49941253662109375, "kl": 0.0, "learning_rate": 1.916839199447895e-07, "loss": 0.0061, "num_tokens": 57936996.0, "reward": 0.7166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1838 }, { "completion_length": 2407.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 6005.0, "completions/max_terminated_length": 6005.0, "completions/mean_length": 2407.83349609375, "completions/mean_terminated_length": 2407.83349609375, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.6238127544097694, "frac_reward_zero_std": 1.0, "grad_norm": 1.869484975713931e-07, "kl": 0.0, "learning_rate": 1.9151138716356108e-07, "loss": 0.0, "num_tokens": 57977920.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1839 }, { "completion_length": 1450.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2365.0, "completions/max_terminated_length": 2365.0, "completions/mean_length": 1450.25, "completions/mean_terminated_length": 1450.25, "completions/min_length": 517.0, "completions/min_terminated_length": 517.0, "epoch": 0.6241519674355496, "frac_reward_zero_std": 1.0, "grad_norm": 1.0029268082689669e-07, "kl": 0.0, "learning_rate": 1.9133885438233264e-07, "loss": 0.0, "num_tokens": 58007599.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1840 }, { "completion_length": 1371.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2576.0, "completions/max_terminated_length": 2576.0, "completions/mean_length": 1371.8333740234375, "completions/mean_terminated_length": 1371.8333740234375, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "epoch": 0.6244911804613297, "frac_reward_zero_std": 0.5, "grad_norm": 0.07739140093326569, "kl": 0.0, "learning_rate": 1.9116632160110422e-07, "loss": -0.0002, "num_tokens": 58038239.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1841 }, { "completion_length": 771.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1097.0, "completions/max_terminated_length": 1097.0, "completions/mean_length": 771.5, "completions/mean_terminated_length": 771.5, "completions/min_length": 514.0, "completions/min_terminated_length": 514.0, "epoch": 0.6248303934871099, "frac_reward_zero_std": 0.5, "grad_norm": 0.07159554958343506, "kl": 0.0, "learning_rate": 1.9099378881987577e-07, "loss": -0.0008, "num_tokens": 58058969.0, "reward": 0.7666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1842 }, { "completion_length": 2071.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4724.0, "completions/max_terminated_length": 4724.0, "completions/mean_length": 2071.166748046875, "completions/mean_terminated_length": 2071.166748046875, "completions/min_length": 860.0, "completions/min_terminated_length": 860.0, "epoch": 0.6251696065128901, "frac_reward_zero_std": 0.0, "grad_norm": 0.6721193790435791, "kl": 0.0, "learning_rate": 1.9082125603864732e-07, "loss": 0.0035, "num_tokens": 58096081.0, "reward": 0.7000000476837158, "reward_std": 0.41311824321746826, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1843 }, { "completion_length": 907.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1388.0, "completions/max_terminated_length": 1388.0, "completions/mean_length": 907.3333740234375, "completions/mean_terminated_length": 907.3333740234375, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 0.6255088195386703, "frac_reward_zero_std": 1.0, "grad_norm": 1.7644202898736694e-07, "kl": 0.0, "learning_rate": 1.906487232574189e-07, "loss": 0.0, "num_tokens": 58118879.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1844 }, { "completion_length": 528.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 915.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 528.5, "completions/mean_terminated_length": 528.5, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.6258480325644504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.9047619047619045e-07, "loss": 0.0, "num_tokens": 58131557.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1845 }, { "completion_length": 1440.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3743.0, "completions/max_terminated_length": 3743.0, "completions/mean_length": 1440.0833740234375, "completions/mean_terminated_length": 1440.0833740234375, "completions/min_length": 592.0, "completions/min_terminated_length": 592.0, "epoch": 0.6261872455902306, "frac_reward_zero_std": 0.5, "grad_norm": 0.07683519273996353, "kl": 0.0, "learning_rate": 1.9030365769496203e-07, "loss": -0.0002, "num_tokens": 58159896.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.45000001788139343, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1846 }, { "completion_length": 779.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1564.0, "completions/max_terminated_length": 1564.0, "completions/mean_length": 779.4166870117188, "completions/mean_terminated_length": 779.4166870117188, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.6265264586160109, "frac_reward_zero_std": 1.0, "grad_norm": 1.0013142315301593e-07, "kl": 0.0, "learning_rate": 1.9013112491373359e-07, "loss": 0.0, "num_tokens": 58178675.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1847 }, { "completion_length": 3140.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6577.0, "completions/max_terminated_length": 6577.0, "completions/mean_length": 3140.75, "completions/mean_terminated_length": 3140.75, "completions/min_length": 1045.0, "completions/min_terminated_length": 1045.0, "epoch": 0.6268656716417911, "frac_reward_zero_std": 0.0, "grad_norm": 0.804344654083252, "kl": 0.0, "learning_rate": 1.8995859213250517e-07, "loss": 0.0623, "num_tokens": 58229330.0, "reward": 0.6833333373069763, "reward_std": 0.48042041063308716, "rewards/correctness_reward_func/mean": 0.38333332538604736, "rewards/correctness_reward_func/std": 0.4783177673816681, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1848 }, { "completion_length": 3069.666748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5840.0, "completions/mean_length": 3618.75, "completions/mean_terminated_length": 3348.727294921875, "completions/min_length": 1750.0, "completions/min_terminated_length": 1750.0, "epoch": 0.6272048846675712, "frac_reward_zero_std": 0.0, "grad_norm": 0.19605781137943268, "kl": NaN, "learning_rate": 1.8978605935127672e-07, "loss": -0.0158, "num_tokens": 58277818.0, "reward": 0.7583333849906921, "reward_std": 0.10206204652786255, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1849 }, { "completion_length": 730.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1392.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 730.75, "completions/mean_terminated_length": 730.75, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.6275440976933514, "frac_reward_zero_std": 1.0, "grad_norm": 1.7541657371111796e-07, "kl": 0.0, "learning_rate": 1.896135265700483e-07, "loss": 0.0, "num_tokens": 58295671.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1850 }, { "completion_length": 991.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1707.0, "completions/max_terminated_length": 1707.0, "completions/mean_length": 991.9166870117188, "completions/mean_terminated_length": 991.9166870117188, "completions/min_length": 624.0, "completions/min_terminated_length": 624.0, "epoch": 0.6278833107191316, "frac_reward_zero_std": 1.0, "grad_norm": 1.7590258494237787e-07, "kl": 0.0, "learning_rate": 1.8944099378881988e-07, "loss": 0.0, "num_tokens": 58322628.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1851 }, { "completion_length": 1128.6667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2164.0, "completions/max_terminated_length": 2164.0, "completions/mean_length": 1128.666748046875, "completions/mean_terminated_length": 1128.666748046875, "completions/min_length": 628.0, "completions/min_terminated_length": 628.0, "epoch": 0.6282225237449118, "frac_reward_zero_std": 0.0, "grad_norm": 0.5253918170928955, "kl": 0.0, "learning_rate": 1.8926846100759146e-07, "loss": -0.0093, "num_tokens": 58352084.0, "reward": 0.949999988079071, "reward_std": 0.29902368783950806, "rewards/correctness_reward_func/mean": 0.6500000357627869, "rewards/correctness_reward_func/std": 0.4833594858646393, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1852 }, { "completion_length": 1924.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3803.0, "completions/max_terminated_length": 3803.0, "completions/mean_length": 1924.25, "completions/mean_terminated_length": 1924.25, "completions/min_length": 824.0, "completions/min_terminated_length": 824.0, "epoch": 0.628561736770692, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.89095928226363e-07, "loss": 0.0, "num_tokens": 58388147.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1853 }, { "completion_length": 1066.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2139.0, "completions/max_terminated_length": 2139.0, "completions/mean_length": 1066.3333740234375, "completions/mean_terminated_length": 1066.3333740234375, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 0.6289009497964722, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.8892339544513456e-07, "loss": 0.0, "num_tokens": 58410807.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1854 }, { "completion_length": 2231.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4688.0, "completions/max_terminated_length": 4688.0, "completions/mean_length": 2231.666748046875, "completions/mean_terminated_length": 2231.666748046875, "completions/min_length": 1059.0, "completions/min_terminated_length": 1059.0, "epoch": 0.6292401628222524, "frac_reward_zero_std": 0.5, "grad_norm": 0.0669737160205841, "kl": 0.0, "learning_rate": 1.8875086266390614e-07, "loss": -0.0004, "num_tokens": 58449467.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1855 }, { "completion_length": 1980.9166870117188, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6191.0, "completions/mean_length": 3079.08349609375, "completions/mean_terminated_length": 2377.10009765625, "completions/min_length": 668.0, "completions/min_terminated_length": 668.0, "epoch": 0.6295793758480326, "frac_reward_zero_std": 0.5, "grad_norm": 0.6720130443572998, "kl": NaN, "learning_rate": 1.885783298826777e-07, "loss": -0.0674, "num_tokens": 58483390.0, "reward": 0.7833334803581238, "reward_std": 0.2542964220046997, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 1856 }, { "completion_length": 1487.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3458.0, "completions/max_terminated_length": 3458.0, "completions/mean_length": 1487.0833740234375, "completions/mean_terminated_length": 1487.0833740234375, "completions/min_length": 871.0, "completions/min_terminated_length": 871.0, "epoch": 0.6299185888738128, "frac_reward_zero_std": 0.5, "grad_norm": 0.42474639415740967, "kl": 0.0, "learning_rate": 1.8840579710144927e-07, "loss": 0.0166, "num_tokens": 58512653.0, "reward": 1.0333335399627686, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1857 }, { "completion_length": 1003.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1608.0, "completions/max_terminated_length": 1608.0, "completions/mean_length": 1003.0833740234375, "completions/mean_terminated_length": 1003.0833740234375, "completions/min_length": 567.0, "completions/min_terminated_length": 567.0, "epoch": 0.6302578018995929, "frac_reward_zero_std": 0.0, "grad_norm": 0.36979466676712036, "kl": 0.0, "learning_rate": 1.8823326432022083e-07, "loss": -0.0021, "num_tokens": 58538238.0, "reward": 0.9833333492279053, "reward_std": 0.2599138617515564, "rewards/correctness_reward_func/mean": 0.6833333373069763, "rewards/correctness_reward_func/std": 0.4217568039894104, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1858 }, { "completion_length": 662.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1098.0, "completions/max_terminated_length": 1098.0, "completions/mean_length": 662.3333740234375, "completions/mean_terminated_length": 662.3333740234375, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.6305970149253731, "frac_reward_zero_std": 0.5, "grad_norm": 0.08170771598815918, "kl": 0.0, "learning_rate": 1.880607315389924e-07, "loss": -0.0003, "num_tokens": 58561168.0, "reward": 0.7666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1859 }, { "completion_length": 1165.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2840.0, "completions/max_terminated_length": 2840.0, "completions/mean_length": 1165.5833740234375, "completions/mean_terminated_length": 1165.5833740234375, "completions/min_length": 586.0, "completions/min_terminated_length": 586.0, "epoch": 0.6309362279511533, "frac_reward_zero_std": 1.0, "grad_norm": 1.9358810732228449e-07, "kl": 0.0, "learning_rate": 1.8788819875776396e-07, "loss": 0.0, "num_tokens": 58589309.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1860 }, { "completion_length": 418.4166717529297, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 967.5, "completions/mean_terminated_length": 456.4545593261719, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.6312754409769336, "frac_reward_zero_std": 0.5, "grad_norm": 0.19920192658901215, "kl": NaN, "learning_rate": 1.8771566597653554e-07, "loss": -0.0081, "num_tokens": 58607848.0, "reward": 0.6916667819023132, "reward_std": 0.26536136865615845, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1861 }, { "completion_length": 731.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 731.5833740234375, "completions/mean_terminated_length": 731.5833740234375, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.6316146540027137, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.875431331953071e-07, "loss": 0.0, "num_tokens": 58628087.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1862 }, { "completion_length": 1078.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2153.0, "completions/max_terminated_length": 2153.0, "completions/mean_length": 1078.916748046875, "completions/mean_terminated_length": 1078.916748046875, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 0.6319538670284939, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.8737060041407867e-07, "loss": 0.0, "num_tokens": 58654282.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1863 }, { "completion_length": 2359.58349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 4139.0, "completions/max_terminated_length": 4139.0, "completions/mean_length": 2359.58349609375, "completions/mean_terminated_length": 2359.58349609375, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "epoch": 0.6322930800542741, "frac_reward_zero_std": 0.0, "grad_norm": 0.381504088640213, "kl": 0.0, "learning_rate": 1.8719806763285022e-07, "loss": 0.0146, "num_tokens": 58697339.0, "reward": 1.0833333730697632, "reward_std": 0.21807155013084412, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1864 }, { "completion_length": 784.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1702.0, "completions/max_terminated_length": 1702.0, "completions/mean_length": 784.0833740234375, "completions/mean_terminated_length": 784.0833740234375, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 0.6326322930800543, "frac_reward_zero_std": 0.5, "grad_norm": 0.07380668818950653, "kl": 0.0, "learning_rate": 1.870255348516218e-07, "loss": -0.0007, "num_tokens": 58717806.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1865 }, { "completion_length": 2078.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4302.0, "completions/max_terminated_length": 4302.0, "completions/mean_length": 2078.25, "completions/mean_terminated_length": 2078.25, "completions/min_length": 712.0, "completions/min_terminated_length": 712.0, "epoch": 0.6329715061058344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.8685300207039335e-07, "loss": 0.0, "num_tokens": 58752669.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1866 }, { "completion_length": 1385.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2889.0, "completions/max_terminated_length": 2889.0, "completions/mean_length": 1385.75, "completions/mean_terminated_length": 1385.75, "completions/min_length": 640.0, "completions/min_terminated_length": 640.0, "epoch": 0.6333107191316146, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.8668046928916493e-07, "loss": 0.0, "num_tokens": 58783662.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1867 }, { "completion_length": 1509.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3594.0, "completions/max_terminated_length": 3594.0, "completions/mean_length": 1509.0833740234375, "completions/mean_terminated_length": 1509.0833740234375, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "epoch": 0.6336499321573948, "frac_reward_zero_std": 0.0, "grad_norm": 0.1501000076532364, "kl": 0.0, "learning_rate": 1.865079365079365e-07, "loss": -0.0017, "num_tokens": 58816255.0, "reward": 1.1666667461395264, "reward_std": 0.0955970510840416, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1868 }, { "completion_length": 832.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1872.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 832.0833740234375, "completions/mean_terminated_length": 832.0833740234375, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 0.6339891451831751, "frac_reward_zero_std": 1.0, "grad_norm": 7.66585301903433e-08, "kl": 0.0, "learning_rate": 1.8633540372670807e-07, "loss": 0.0, "num_tokens": 58839902.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1869 }, { "completion_length": 983.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1474.0, "completions/max_terminated_length": 1474.0, "completions/mean_length": 983.6666870117188, "completions/mean_terminated_length": 983.6666870117188, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.6343283582089553, "frac_reward_zero_std": 0.5, "grad_norm": 0.45892953872680664, "kl": 0.0, "learning_rate": 1.8616287094547965e-07, "loss": 0.0006, "num_tokens": 58861756.0, "reward": 0.6333333253860474, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1870 }, { "completion_length": 2035.2501220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5550.0, "completions/mean_length": 2584.33349609375, "completions/mean_terminated_length": 2220.272705078125, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 0.6346675712347354, "frac_reward_zero_std": 0.5, "grad_norm": 0.6276018023490906, "kl": NaN, "learning_rate": 1.859903381642512e-07, "loss": -0.0495, "num_tokens": 58897339.0, "reward": 0.4750000238418579, "reward_std": 0.2524876594543457, "rewards/correctness_reward_func/mean": 0.20000000298023224, "rewards/correctness_reward_func/std": 0.36181363463401794, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 1871 }, { "completion_length": 720.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1156.0, "completions/max_terminated_length": 1156.0, "completions/mean_length": 720.5833740234375, "completions/mean_terminated_length": 720.5833740234375, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.6350067842605156, "frac_reward_zero_std": 1.0, "grad_norm": 8.638304649366546e-08, "kl": 0.0, "learning_rate": 1.8581780538302278e-07, "loss": 0.0, "num_tokens": 58915136.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1872 }, { "completion_length": 644.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1599.0, "completions/max_terminated_length": 1599.0, "completions/mean_length": 644.25, "completions/mean_terminated_length": 644.25, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.6353459972862958, "frac_reward_zero_std": 0.5, "grad_norm": 0.2432372123003006, "kl": 0.0, "learning_rate": 1.8564527260179433e-07, "loss": -0.0045, "num_tokens": 58933295.0, "reward": 1.2166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1873 }, { "completion_length": 686.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1034.0, "completions/max_terminated_length": 1034.0, "completions/mean_length": 686.0833740234375, "completions/mean_terminated_length": 686.0833740234375, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.635685210312076, "frac_reward_zero_std": 1.0, "grad_norm": 1.1722418236104204e-07, "kl": 0.0, "learning_rate": 1.854727398205659e-07, "loss": 0.0, "num_tokens": 58952904.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1874 }, { "completion_length": 1080.4167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 1983.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 1080.416748046875, "completions/mean_terminated_length": 1080.416748046875, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "epoch": 0.6360244233378561, "frac_reward_zero_std": 0.5, "grad_norm": 0.059044767171144485, "kl": 0.0, "learning_rate": 1.8530020703933746e-07, "loss": 0.0004, "num_tokens": 58978451.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1875 }, { "completion_length": 810.0000305175781, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 3791.0, "completions/mean_length": 3555.416748046875, "completions/mean_terminated_length": 1388.571533203125, "completions/min_length": 931.0, "completions/min_terminated_length": 931.0, "epoch": 0.6363636363636364, "frac_reward_zero_std": 0.5, "grad_norm": 0.7442070841789246, "kl": NaN, "learning_rate": 1.8512767425810904e-07, "loss": -0.0519, "num_tokens": 59001575.0, "reward": 0.7583333849906921, "reward_std": 0.26536136865615845, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.5149286389350891, "rewards/format_reward_func/mean": 0.17499999701976776, "rewards/format_reward_func/std": 0.15447859466075897, "step": 1876 }, { "completion_length": 1292.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3425.0, "completions/max_terminated_length": 3425.0, "completions/mean_length": 1292.666748046875, "completions/mean_terminated_length": 1292.666748046875, "completions/min_length": 692.0, "completions/min_terminated_length": 692.0, "epoch": 0.6367028493894166, "frac_reward_zero_std": 0.5, "grad_norm": 0.39720451831817627, "kl": 0.0, "learning_rate": 1.849551414768806e-07, "loss": -0.0004, "num_tokens": 59025199.0, "reward": 1.1000001430511475, "reward_std": 0.19999998807907104, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1877 }, { "completion_length": 769.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1060.0, "completions/max_terminated_length": 1060.0, "completions/mean_length": 769.5833740234375, "completions/mean_terminated_length": 769.5833740234375, "completions/min_length": 586.0, "completions/min_terminated_length": 586.0, "epoch": 0.6370420624151968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.8478260869565215e-07, "loss": 0.0, "num_tokens": 59049806.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1878 }, { "completion_length": 535.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 535.1666870117188, "completions/mean_terminated_length": 535.1666870117188, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 0.6373812754409769, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.8461007591442373e-07, "loss": 0.0, "num_tokens": 59066698.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1879 }, { "completion_length": 1052.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2015.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1052.0, "completions/mean_terminated_length": 1052.0, "completions/min_length": 581.0, "completions/min_terminated_length": 581.0, "epoch": 0.6377204884667571, "frac_reward_zero_std": 0.5, "grad_norm": 0.44020894169807434, "kl": 0.0, "learning_rate": 1.8443754313319528e-07, "loss": 0.0049, "num_tokens": 59088376.0, "reward": 0.6666667461395264, "reward_std": 0.18618986010551453, "rewards/correctness_reward_func/mean": 0.36666667461395264, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1880 }, { "completion_length": 1299.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2327.0, "completions/max_terminated_length": 2327.0, "completions/mean_length": 1299.916748046875, "completions/mean_terminated_length": 1299.916748046875, "completions/min_length": 773.0, "completions/min_terminated_length": 773.0, "epoch": 0.6380597014925373, "frac_reward_zero_std": 0.5, "grad_norm": 0.09209086745977402, "kl": 0.0, "learning_rate": 1.8426501035196686e-07, "loss": 0.0003, "num_tokens": 59117541.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1881 }, { "completion_length": 807.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1194.0, "completions/max_terminated_length": 1194.0, "completions/mean_length": 807.4166870117188, "completions/mean_terminated_length": 807.4166870117188, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 0.6383989145183175, "frac_reward_zero_std": 1.0, "grad_norm": 2.2377925290584244e-07, "kl": 0.0, "learning_rate": 1.8409247757073844e-07, "loss": 0.0, "num_tokens": 59138918.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1882 }, { "completion_length": 1487.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3056.0, "completions/max_terminated_length": 3056.0, "completions/mean_length": 1487.5833740234375, "completions/mean_terminated_length": 1487.5833740234375, "completions/min_length": 695.0, "completions/min_terminated_length": 695.0, "epoch": 0.6387381275440976, "frac_reward_zero_std": 1.0, "grad_norm": 1.2361817880446324e-07, "kl": 0.0, "learning_rate": 1.8391994478951002e-07, "loss": 0.0, "num_tokens": 59166399.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1883 }, { "completion_length": 1158.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1960.0, "completions/max_terminated_length": 1960.0, "completions/mean_length": 1158.0, "completions/mean_terminated_length": 1158.0, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "epoch": 0.6390773405698779, "frac_reward_zero_std": 0.5, "grad_norm": 0.09400633722543716, "kl": 0.0, "learning_rate": 1.8374741200828157e-07, "loss": -0.0018, "num_tokens": 59191029.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1884 }, { "completion_length": 955.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 3360.0, "completions/max_terminated_length": 3360.0, "completions/mean_length": 955.0833740234375, "completions/mean_terminated_length": 955.0833740234375, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 0.6394165535956581, "frac_reward_zero_std": 0.5, "grad_norm": 0.0996096059679985, "kl": 0.0, "learning_rate": 1.8357487922705315e-07, "loss": 0.0026, "num_tokens": 59214286.0, "reward": 1.133333444595337, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1885 }, { "completion_length": 1315.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2575.0, "completions/max_terminated_length": 2575.0, "completions/mean_length": 1315.5, "completions/mean_terminated_length": 1315.5, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.6397557666214383, "frac_reward_zero_std": 0.5, "grad_norm": 0.058993469923734665, "kl": 0.0, "learning_rate": 1.834023464458247e-07, "loss": 0.0, "num_tokens": 59238268.0, "reward": 1.2333333492279053, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1886 }, { "completion_length": 960.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2076.0, "completions/max_terminated_length": 2076.0, "completions/mean_length": 960.5, "completions/mean_terminated_length": 960.5, "completions/min_length": 630.0, "completions/min_terminated_length": 630.0, "epoch": 0.6400949796472184, "frac_reward_zero_std": 0.5, "grad_norm": 0.07736815512180328, "kl": 0.0, "learning_rate": 1.8322981366459628e-07, "loss": -0.0021, "num_tokens": 59259280.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1887 }, { "completion_length": 874.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2513.0, "completions/max_terminated_length": 2513.0, "completions/mean_length": 874.3333740234375, "completions/mean_terminated_length": 874.3333740234375, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 0.6404341926729986, "frac_reward_zero_std": 1.0, "grad_norm": 1.5926157459489332e-07, "kl": 0.0, "learning_rate": 1.8305728088336783e-07, "loss": 0.0, "num_tokens": 59283080.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1888 }, { "completion_length": 1786.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5626.0, "completions/max_terminated_length": 5626.0, "completions/mean_length": 1786.25, "completions/mean_terminated_length": 1786.25, "completions/min_length": 533.0, "completions/min_terminated_length": 533.0, "epoch": 0.6407734056987788, "frac_reward_zero_std": 1.0, "grad_norm": 2.7361249976820545e-07, "kl": 0.0, "learning_rate": 1.828847481021394e-07, "loss": 0.0, "num_tokens": 59315843.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1889 }, { "completion_length": 575.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 938.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 575.5, "completions/mean_terminated_length": 575.5, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.641112618724559, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.8271221532091097e-07, "loss": 0.0, "num_tokens": 59332061.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1890 }, { "completion_length": 703.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1084.0, "completions/max_terminated_length": 1084.0, "completions/mean_length": 703.3333740234375, "completions/mean_terminated_length": 703.3333740234375, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 0.6414518317503393, "frac_reward_zero_std": 0.5, "grad_norm": 0.07110600918531418, "kl": 0.0, "learning_rate": 1.8253968253968252e-07, "loss": 0.0003, "num_tokens": 59352165.0, "reward": 1.2333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1891 }, { "completion_length": 654.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 654.9166870117188, "completions/mean_terminated_length": 654.9166870117188, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.6417910447761194, "frac_reward_zero_std": 0.5, "grad_norm": 0.06704479455947876, "kl": 0.0, "learning_rate": 1.823671497584541e-07, "loss": 0.0002, "num_tokens": 59370248.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1892 }, { "completion_length": 1287.9166870117188, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3263.0, "completions/mean_length": 1837.0, "completions/mean_terminated_length": 1405.0, "completions/min_length": 597.0, "completions/min_terminated_length": 597.0, "epoch": 0.6421302578018996, "frac_reward_zero_std": 0.5, "grad_norm": 0.07492797821760178, "kl": NaN, "learning_rate": 1.8219461697722565e-07, "loss": 0.0006, "num_tokens": 59394031.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1893 }, { "completion_length": 2329.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5974.0, "completions/max_terminated_length": 5974.0, "completions/mean_length": 2329.5, "completions/mean_terminated_length": 2329.5, "completions/min_length": 697.0, "completions/min_terminated_length": 697.0, "epoch": 0.6424694708276798, "frac_reward_zero_std": 0.5, "grad_norm": 0.42830121517181396, "kl": 0.0, "learning_rate": 1.8202208419599723e-07, "loss": 0.0096, "num_tokens": 59438761.0, "reward": 0.7666667699813843, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1894 }, { "completion_length": 959.0833587646484, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6303.0, "completions/mean_length": 3704.5, "completions/mean_terminated_length": 1644.1429443359375, "completions/min_length": 551.0, "completions/min_terminated_length": 551.0, "epoch": 0.64280868385346, "frac_reward_zero_std": 0.5, "grad_norm": 0.6374104619026184, "kl": NaN, "learning_rate": 1.8184955141476878e-07, "loss": -0.0731, "num_tokens": 59462438.0, "reward": 0.7416666746139526, "reward_std": 0.22453658282756805, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.17500001192092896, "rewards/format_reward_func/std": 0.15447859466075897, "step": 1895 }, { "completion_length": 694.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1248.0, "completions/max_terminated_length": 1248.0, "completions/mean_length": 694.5, "completions/mean_terminated_length": 694.5, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.6431478968792401, "frac_reward_zero_std": 0.0, "grad_norm": 0.11773252487182617, "kl": 0.0, "learning_rate": 1.8167701863354036e-07, "loss": 0.0018, "num_tokens": 59482982.0, "reward": 1.1666667461395264, "reward_std": 0.10327953100204468, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1896 }, { "completion_length": 1409.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3373.0, "completions/max_terminated_length": 3373.0, "completions/mean_length": 1409.75, "completions/mean_terminated_length": 1409.75, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 0.6434871099050203, "frac_reward_zero_std": 0.5, "grad_norm": 0.09757261723279953, "kl": 0.0, "learning_rate": 1.8150448585231192e-07, "loss": 0.0006, "num_tokens": 59514755.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1897 }, { "completion_length": 1794.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4476.0, "completions/max_terminated_length": 4476.0, "completions/mean_length": 1794.5833740234375, "completions/mean_terminated_length": 1794.5833740234375, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "epoch": 0.6438263229308006, "frac_reward_zero_std": 1.0, "grad_norm": 2.9011417268520745e-07, "kl": 0.0, "learning_rate": 1.8133195307108352e-07, "loss": 0.0, "num_tokens": 59548302.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1898 }, { "completion_length": 739.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 739.1666870117188, "completions/mean_terminated_length": 739.1666870117188, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.6441655359565808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.8115942028985507e-07, "loss": 0.0, "num_tokens": 59572430.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1899 }, { "completion_length": 1891.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3228.0, "completions/max_terminated_length": 3228.0, "completions/mean_length": 1891.166748046875, "completions/mean_terminated_length": 1891.166748046875, "completions/min_length": 1166.0, "completions/min_terminated_length": 1166.0, "epoch": 0.6445047489823609, "frac_reward_zero_std": 1.0, "grad_norm": 2.0028788583204005e-07, "kl": 0.0, "learning_rate": 1.8098688750862663e-07, "loss": 0.0, "num_tokens": 59603422.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1900 }, { "completion_length": 618.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1098.0, "completions/max_terminated_length": 1098.0, "completions/mean_length": 618.25, "completions/mean_terminated_length": 618.25, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 0.6448439620081411, "frac_reward_zero_std": 0.5, "grad_norm": 0.0646485686302185, "kl": 0.0, "learning_rate": 1.808143547273982e-07, "loss": -0.0002, "num_tokens": 59618839.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1901 }, { "completion_length": 1359.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5106.0, "completions/max_terminated_length": 5106.0, "completions/mean_length": 1359.0833740234375, "completions/mean_terminated_length": 1359.0833740234375, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.6451831750339213, "frac_reward_zero_std": 0.5, "grad_norm": 0.13194362819194794, "kl": 0.0, "learning_rate": 1.8064182194616976e-07, "loss": 0.0064, "num_tokens": 59645192.0, "reward": 1.25, "reward_std": 0.05477222427725792, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1902 }, { "completion_length": 694.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1279.0, "completions/max_terminated_length": 1279.0, "completions/mean_length": 694.4166870117188, "completions/mean_terminated_length": 694.4166870117188, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.6455223880597015, "frac_reward_zero_std": 0.5, "grad_norm": 0.15142489969730377, "kl": 0.0, "learning_rate": 1.8046928916494134e-07, "loss": 0.0, "num_tokens": 59664115.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1903 }, { "completion_length": 859.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1758.0, "completions/max_terminated_length": 1758.0, "completions/mean_length": 859.3333740234375, "completions/mean_terminated_length": 859.3333740234375, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.6458616010854816, "frac_reward_zero_std": 1.0, "grad_norm": 1.1864783289183833e-07, "kl": 0.0, "learning_rate": 1.802967563837129e-07, "loss": 0.0, "num_tokens": 59684429.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1904 }, { "completion_length": 1856.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5585.0, "completions/max_terminated_length": 5585.0, "completions/mean_length": 1856.25, "completions/mean_terminated_length": 1856.25, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "epoch": 0.6462008141112618, "frac_reward_zero_std": 1.0, "grad_norm": 2.1230609092981467e-07, "kl": 0.0, "learning_rate": 1.8012422360248447e-07, "loss": 0.0, "num_tokens": 59721914.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1905 }, { "completion_length": 1405.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3020.0, "completions/max_terminated_length": 3020.0, "completions/mean_length": 1405.5833740234375, "completions/mean_terminated_length": 1405.5833740234375, "completions/min_length": 728.0, "completions/min_terminated_length": 728.0, "epoch": 0.6465400271370421, "frac_reward_zero_std": 1.0, "grad_norm": 1.5573637313082145e-07, "kl": 0.0, "learning_rate": 1.7995169082125602e-07, "loss": 0.0, "num_tokens": 59746389.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1906 }, { "completion_length": 1230.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4024.0, "completions/max_terminated_length": 4024.0, "completions/mean_length": 1230.3333740234375, "completions/mean_terminated_length": 1230.3333740234375, "completions/min_length": 653.0, "completions/min_terminated_length": 653.0, "epoch": 0.6468792401628223, "frac_reward_zero_std": 0.5, "grad_norm": 0.09207607805728912, "kl": 0.0, "learning_rate": 1.797791580400276e-07, "loss": 0.0025, "num_tokens": 59769223.0, "reward": 1.2333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1907 }, { "completion_length": 2496.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5923.0, "completions/max_terminated_length": 5923.0, "completions/mean_length": 2496.33349609375, "completions/mean_terminated_length": 2496.33349609375, "completions/min_length": 749.0, "completions/min_terminated_length": 749.0, "epoch": 0.6472184531886025, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.7960662525879916e-07, "loss": 0.0, "num_tokens": 59811989.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1908 }, { "completion_length": 1220.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1840.0, "completions/max_terminated_length": 1840.0, "completions/mean_length": 1220.5, "completions/mean_terminated_length": 1220.5, "completions/min_length": 645.0, "completions/min_terminated_length": 645.0, "epoch": 0.6475576662143826, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.7943409247757073e-07, "loss": 0.0, "num_tokens": 59841155.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1909 }, { "completion_length": 1401.7500610351562, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 2667.0, "completions/mean_length": 3598.08349609375, "completions/mean_terminated_length": 2102.625, "completions/min_length": 1442.0, "completions/min_terminated_length": 1442.0, "epoch": 0.6478968792401628, "frac_reward_zero_std": 0.5, "grad_norm": 0.5642118453979492, "kl": NaN, "learning_rate": 1.792615596963423e-07, "loss": -0.0514, "num_tokens": 59866934.0, "reward": 0.3500000238418579, "reward_std": 0.31144821643829346, "rewards/correctness_reward_func/mean": 0.14999999105930328, "rewards/correctness_reward_func/std": 0.35291001200675964, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 1910 }, { "completion_length": 2052.0000610351562, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5180.0, "completions/mean_length": 3150.166748046875, "completions/mean_terminated_length": 2462.400146484375, "completions/min_length": 868.0, "completions/min_terminated_length": 868.0, "epoch": 0.648236092265943, "frac_reward_zero_std": 0.5, "grad_norm": 0.5336740612983704, "kl": NaN, "learning_rate": 1.7908902691511384e-07, "loss": -0.0275, "num_tokens": 59900912.0, "reward": 0.7333334684371948, "reward_std": 0.24013882875442505, "rewards/correctness_reward_func/mean": 0.4833333194255829, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 1911 }, { "completion_length": 967.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1452.0, "completions/max_terminated_length": 1452.0, "completions/mean_length": 967.0, "completions/mean_terminated_length": 967.0, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.6485753052917232, "frac_reward_zero_std": 0.5, "grad_norm": 0.11863810569047928, "kl": 0.0, "learning_rate": 1.7891649413388542e-07, "loss": -0.001, "num_tokens": 59920988.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1912 }, { "completion_length": 1407.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2942.0, "completions/max_terminated_length": 2942.0, "completions/mean_length": 1407.3333740234375, "completions/mean_terminated_length": 1407.3333740234375, "completions/min_length": 611.0, "completions/min_terminated_length": 611.0, "epoch": 0.6489145183175034, "frac_reward_zero_std": 0.5, "grad_norm": 0.06550927460193634, "kl": 0.0, "learning_rate": 1.7874396135265697e-07, "loss": 0.0018, "num_tokens": 59952306.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1913 }, { "completion_length": 842.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1520.0, "completions/max_terminated_length": 1520.0, "completions/mean_length": 842.8333740234375, "completions/mean_terminated_length": 842.8333740234375, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.6492537313432836, "frac_reward_zero_std": 0.5, "grad_norm": 0.06949958205223083, "kl": 0.0, "learning_rate": 1.7857142857142858e-07, "loss": -0.0002, "num_tokens": 59974966.0, "reward": 0.7666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1914 }, { "completion_length": 771.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1352.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 771.4166870117188, "completions/mean_terminated_length": 771.4166870117188, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 0.6495929443690638, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.7839889579020013e-07, "loss": 0.0, "num_tokens": 59995587.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1915 }, { "completion_length": 890.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1166.0, "completions/max_terminated_length": 1166.0, "completions/mean_length": 890.0, "completions/mean_terminated_length": 890.0, "completions/min_length": 558.0, "completions/min_terminated_length": 558.0, "epoch": 0.649932157394844, "frac_reward_zero_std": 1.0, "grad_norm": 1.7561757204020978e-07, "kl": 0.0, "learning_rate": 1.782263630089717e-07, "loss": 0.0, "num_tokens": 60015105.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1916 }, { "completion_length": 682.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1166.0, "completions/max_terminated_length": 1166.0, "completions/mean_length": 682.3333740234375, "completions/mean_terminated_length": 682.3333740234375, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 0.6502713704206241, "frac_reward_zero_std": 1.0, "grad_norm": 1.4465112485595455e-07, "kl": 0.0, "learning_rate": 1.7805383022774326e-07, "loss": 0.0, "num_tokens": 60032119.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1917 }, { "completion_length": 736.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1280.0, "completions/max_terminated_length": 1280.0, "completions/mean_length": 736.0, "completions/mean_terminated_length": 736.0, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.6506105834464043, "frac_reward_zero_std": 0.5, "grad_norm": 0.09809959679841995, "kl": 0.0, "learning_rate": 1.7788129744651484e-07, "loss": -0.0012, "num_tokens": 60052759.0, "reward": 0.7333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1918 }, { "completion_length": 1051.6667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 1830.0, "completions/max_terminated_length": 1830.0, "completions/mean_length": 1051.666748046875, "completions/mean_terminated_length": 1051.666748046875, "completions/min_length": 610.0, "completions/min_terminated_length": 610.0, "epoch": 0.6509497964721845, "frac_reward_zero_std": 0.5, "grad_norm": 0.08397035300731659, "kl": 0.0, "learning_rate": 1.777087646652864e-07, "loss": -0.0013, "num_tokens": 60077229.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1919 }, { "completion_length": 2681.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6152.0, "completions/max_terminated_length": 6152.0, "completions/mean_length": 2681.25, "completions/mean_terminated_length": 2681.25, "completions/min_length": 1120.0, "completions/min_terminated_length": 1120.0, "epoch": 0.6512890094979648, "frac_reward_zero_std": 1.0, "grad_norm": 1.5928553409594315e-07, "kl": 0.0, "learning_rate": 1.7753623188405797e-07, "loss": 0.0, "num_tokens": 60124272.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1920 }, { "completion_length": 1827.0833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4476.0, "completions/mean_length": 2376.166748046875, "completions/mean_terminated_length": 1993.181884765625, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 0.6516282225237449, "frac_reward_zero_std": 0.5, "grad_norm": 0.07494795322418213, "kl": NaN, "learning_rate": 1.7736369910282953e-07, "loss": -0.011, "num_tokens": 60158521.0, "reward": 0.7749999761581421, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1921 }, { "completion_length": 1912.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4249.0, "completions/max_terminated_length": 4249.0, "completions/mean_length": 1912.166748046875, "completions/mean_terminated_length": 1912.166748046875, "completions/min_length": 632.0, "completions/min_terminated_length": 632.0, "epoch": 0.6519674355495251, "frac_reward_zero_std": 1.0, "grad_norm": 1.7009043062898854e-07, "kl": 0.0, "learning_rate": 1.771911663216011e-07, "loss": 0.0, "num_tokens": 60193059.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1922 }, { "completion_length": 1308.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2247.0, "completions/max_terminated_length": 2247.0, "completions/mean_length": 1308.75, "completions/mean_terminated_length": 1308.75, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.6523066485753053, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.7701863354037266e-07, "loss": 0.0, "num_tokens": 60217410.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1923 }, { "completion_length": 676.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1236.0, "completions/max_terminated_length": 1236.0, "completions/mean_length": 676.25, "completions/mean_terminated_length": 676.25, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.6526458616010855, "frac_reward_zero_std": 1.0, "grad_norm": 9.324396899046405e-08, "kl": 0.0, "learning_rate": 1.768461007591442e-07, "loss": 0.0, "num_tokens": 60239937.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1924 }, { "completion_length": 715.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1283.0, "completions/max_terminated_length": 1283.0, "completions/mean_length": 715.6666870117188, "completions/mean_terminated_length": 715.6666870117188, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.6529850746268657, "frac_reward_zero_std": 0.0, "grad_norm": 0.07660546898841858, "kl": 0.0, "learning_rate": 1.766735679779158e-07, "loss": -0.0009, "num_tokens": 60259415.0, "reward": 1.2000000476837158, "reward_std": 0.08164963126182556, "rewards/correctness_reward_func/mean": 0.9000000357627869, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1925 }, { "completion_length": 2130.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3973.0, "completions/max_terminated_length": 3973.0, "completions/mean_length": 2130.83349609375, "completions/mean_terminated_length": 2130.83349609375, "completions/min_length": 1259.0, "completions/min_terminated_length": 1259.0, "epoch": 0.6533242876526458, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.7650103519668734e-07, "loss": 0.0, "num_tokens": 60296055.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1926 }, { "completion_length": 1860.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5086.0, "completions/max_terminated_length": 5086.0, "completions/mean_length": 1860.166748046875, "completions/mean_terminated_length": 1860.166748046875, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 0.653663500678426, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.7632850241545892e-07, "loss": 0.0, "num_tokens": 60332699.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1927 }, { "completion_length": 1343.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1891.0, "completions/max_terminated_length": 1891.0, "completions/mean_length": 1343.416748046875, "completions/mean_terminated_length": 1343.416748046875, "completions/min_length": 976.0, "completions/min_terminated_length": 976.0, "epoch": 0.6540027137042063, "frac_reward_zero_std": 0.5, "grad_norm": 0.07084862142801285, "kl": 0.0, "learning_rate": 1.7615596963423048e-07, "loss": -0.0003, "num_tokens": 60359098.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1928 }, { "completion_length": 491.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 491.0833435058594, "completions/mean_terminated_length": 491.0833435058594, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.6543419267299865, "frac_reward_zero_std": 1.0, "grad_norm": 1.0508670555964272e-07, "kl": 0.0, "learning_rate": 1.7598343685300208e-07, "loss": 0.0, "num_tokens": 60379199.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1929 }, { "completion_length": 877.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1989.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 877.9166870117188, "completions/mean_terminated_length": 877.9166870117188, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.6546811397557666, "frac_reward_zero_std": 0.5, "grad_norm": 0.07653743773698807, "kl": 0.0, "learning_rate": 1.7581090407177364e-07, "loss": 0.0002, "num_tokens": 60403414.0, "reward": 0.7333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1930 }, { "completion_length": 828.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1705.0, "completions/max_terminated_length": 1705.0, "completions/mean_length": 828.8333740234375, "completions/mean_terminated_length": 828.8333740234375, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 0.6550203527815468, "frac_reward_zero_std": 1.0, "grad_norm": 1.2056402454163617e-07, "kl": 0.0, "learning_rate": 1.7563837129054521e-07, "loss": 0.0, "num_tokens": 60420644.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1931 }, { "completion_length": 736.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1324.0, "completions/max_terminated_length": 1324.0, "completions/mean_length": 736.6666870117188, "completions/mean_terminated_length": 736.6666870117188, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 0.655359565807327, "frac_reward_zero_std": 1.0, "grad_norm": 8.723392141973818e-08, "kl": 0.0, "learning_rate": 1.7546583850931677e-07, "loss": 0.0, "num_tokens": 60442210.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1932 }, { "completion_length": 560.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 805.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 560.5, "completions/mean_terminated_length": 560.5, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.6556987788331072, "frac_reward_zero_std": 0.5, "grad_norm": 0.03854304552078247, "kl": 0.0, "learning_rate": 1.7529330572808835e-07, "loss": -0.0, "num_tokens": 60460846.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1933 }, { "completion_length": 762.9166870117188, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 2846.0, "completions/mean_length": 2959.25, "completions/mean_terminated_length": 1144.375, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 0.6560379918588873, "frac_reward_zero_std": 0.0, "grad_norm": 0.6417205333709717, "kl": NaN, "learning_rate": 1.751207729468599e-07, "loss": -0.045, "num_tokens": 60484779.0, "reward": 0.8166667819023132, "reward_std": 0.3630880117416382, "rewards/correctness_reward_func/mean": 0.6166666746139526, "rewards/correctness_reward_func/std": 0.4628632962703705, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.14770980179309845, "step": 1934 }, { "completion_length": 920.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4016.0, "completions/max_terminated_length": 4016.0, "completions/mean_length": 920.6666870117188, "completions/mean_terminated_length": 920.6666870117188, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.6563772048846676, "frac_reward_zero_std": 0.5, "grad_norm": 0.5792922377586365, "kl": 0.0, "learning_rate": 1.7494824016563145e-07, "loss": 0.0277, "num_tokens": 60509927.0, "reward": 1.1666667461395264, "reward_std": 0.18618986010551453, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.287096232175827, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1935 }, { "completion_length": 760.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 760.5, "completions/mean_terminated_length": 760.5, "completions/min_length": 649.0, "completions/min_terminated_length": 649.0, "epoch": 0.6567164179104478, "frac_reward_zero_std": 1.0, "grad_norm": 1.0449276999224821e-07, "kl": 0.0, "learning_rate": 1.7477570738440303e-07, "loss": 0.0, "num_tokens": 60531653.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1936 }, { "completion_length": 985.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1796.0, "completions/max_terminated_length": 1796.0, "completions/mean_length": 985.8333740234375, "completions/mean_terminated_length": 985.8333740234375, "completions/min_length": 514.0, "completions/min_terminated_length": 514.0, "epoch": 0.657055630936228, "frac_reward_zero_std": 0.5, "grad_norm": 0.2661232650279999, "kl": 0.0, "learning_rate": 1.7460317460317458e-07, "loss": 0.0029, "num_tokens": 60556623.0, "reward": 0.36666667461395264, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.06666667014360428, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1937 }, { "completion_length": 1633.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4016.0, "completions/max_terminated_length": 4016.0, "completions/mean_length": 1633.75, "completions/mean_terminated_length": 1633.75, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.6573948439620081, "frac_reward_zero_std": 0.5, "grad_norm": 0.13186636567115784, "kl": 0.0, "learning_rate": 1.7443064182194616e-07, "loss": -0.0019, "num_tokens": 60588690.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1938 }, { "completion_length": 667.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1343.0, "completions/max_terminated_length": 1343.0, "completions/mean_length": 667.3333740234375, "completions/mean_terminated_length": 667.3333740234375, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.6577340569877883, "frac_reward_zero_std": 1.0, "grad_norm": 1.4528134784086433e-07, "kl": 0.0, "learning_rate": 1.7425810904071772e-07, "loss": 0.0, "num_tokens": 60606106.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1939 }, { "completion_length": 1390.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4716.0, "completions/max_terminated_length": 4716.0, "completions/mean_length": 1390.916748046875, "completions/mean_terminated_length": 1390.916748046875, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.6580732700135685, "frac_reward_zero_std": 1.0, "grad_norm": 7.854256978134799e-08, "kl": 0.0, "learning_rate": 1.740855762594893e-07, "loss": 0.0, "num_tokens": 60635103.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1940 }, { "completion_length": 1324.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2500.0, "completions/max_terminated_length": 2500.0, "completions/mean_length": 1324.5, "completions/mean_terminated_length": 1324.5, "completions/min_length": 608.0, "completions/min_terminated_length": 608.0, "epoch": 0.6584124830393487, "frac_reward_zero_std": 0.5, "grad_norm": 0.6688001751899719, "kl": 0.0, "learning_rate": 1.7391304347826085e-07, "loss": 0.0112, "num_tokens": 60664689.0, "reward": 1.133333444595337, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1941 }, { "completion_length": 1475.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2275.0, "completions/max_terminated_length": 2275.0, "completions/mean_length": 1475.166748046875, "completions/mean_terminated_length": 1475.166748046875, "completions/min_length": 764.0, "completions/min_terminated_length": 764.0, "epoch": 0.658751696065129, "frac_reward_zero_std": 0.0, "grad_norm": 0.12396968901157379, "kl": 0.0, "learning_rate": 1.7374051069703243e-07, "loss": -0.003, "num_tokens": 60697727.0, "reward": 1.1500000953674316, "reward_std": 0.09246458858251572, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.09045340120792389, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1942 }, { "completion_length": 891.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1698.0, "completions/max_terminated_length": 1698.0, "completions/mean_length": 891.5833740234375, "completions/mean_terminated_length": 891.5833740234375, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 0.6590909090909091, "frac_reward_zero_std": 0.5, "grad_norm": 0.0628848746418953, "kl": 0.0, "learning_rate": 1.7356797791580398e-07, "loss": -0.0005, "num_tokens": 60721806.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1943 }, { "completion_length": 1081.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1750.0, "completions/max_terminated_length": 1750.0, "completions/mean_length": 1081.8333740234375, "completions/mean_terminated_length": 1081.8333740234375, "completions/min_length": 552.0, "completions/min_terminated_length": 552.0, "epoch": 0.6594301221166893, "frac_reward_zero_std": 0.5, "grad_norm": 0.06809427589178085, "kl": 0.0, "learning_rate": 1.733954451345756e-07, "loss": 0.0002, "num_tokens": 60745510.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1944 }, { "completion_length": 996.6667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2762.0, "completions/max_terminated_length": 2762.0, "completions/mean_length": 996.6666870117188, "completions/mean_terminated_length": 996.6666870117188, "completions/min_length": 541.0, "completions/min_terminated_length": 541.0, "epoch": 0.6597693351424695, "frac_reward_zero_std": 0.5, "grad_norm": 0.05918768793344498, "kl": 0.0, "learning_rate": 1.7322291235334714e-07, "loss": -0.0003, "num_tokens": 60769614.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1945 }, { "completion_length": 947.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2131.0, "completions/max_terminated_length": 2131.0, "completions/mean_length": 947.9166870117188, "completions/mean_terminated_length": 947.9166870117188, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.6601085481682497, "frac_reward_zero_std": 0.5, "grad_norm": 0.34079509973526, "kl": 0.0, "learning_rate": 1.730503795721187e-07, "loss": 0.0066, "num_tokens": 60795335.0, "reward": 1.133333444595337, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1946 }, { "completion_length": 1033.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2015.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1033.5, "completions/mean_terminated_length": 1033.5, "completions/min_length": 574.0, "completions/min_terminated_length": 574.0, "epoch": 0.6604477611940298, "frac_reward_zero_std": 0.5, "grad_norm": 0.3229091763496399, "kl": 0.0, "learning_rate": 1.7287784679089027e-07, "loss": 0.0045, "num_tokens": 60821345.0, "reward": 0.7666667699813843, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1947 }, { "completion_length": 633.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 633.3333740234375, "completions/mean_terminated_length": 633.3333740234375, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.66078697421981, "frac_reward_zero_std": 1.0, "grad_norm": 1.324584530948414e-07, "kl": 0.0, "learning_rate": 1.7270531400966182e-07, "loss": 0.0, "num_tokens": 60842721.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1948 }, { "completion_length": 898.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1437.0, "completions/max_terminated_length": 1437.0, "completions/mean_length": 898.9166870117188, "completions/mean_terminated_length": 898.9166870117188, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 0.6611261872455902, "frac_reward_zero_std": 0.0, "grad_norm": 0.1165386214852333, "kl": 0.0, "learning_rate": 1.725327812284334e-07, "loss": -0.0006, "num_tokens": 60864026.0, "reward": 1.1666667461395264, "reward_std": 0.09559705853462219, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1949 }, { "completion_length": 1022.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2843.0, "completions/max_terminated_length": 2843.0, "completions/mean_length": 1022.3333740234375, "completions/mean_terminated_length": 1022.3333740234375, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 0.6614654002713705, "frac_reward_zero_std": 0.5, "grad_norm": 0.5209102630615234, "kl": 0.0, "learning_rate": 1.7236024844720496e-07, "loss": -0.0206, "num_tokens": 60892158.0, "reward": 0.7833334803581238, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4833333194255829, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1950 }, { "completion_length": 1579.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3833.0, "completions/max_terminated_length": 3833.0, "completions/mean_length": 1579.0833740234375, "completions/mean_terminated_length": 1579.0833740234375, "completions/min_length": 904.0, "completions/min_terminated_length": 904.0, "epoch": 0.6618046132971506, "frac_reward_zero_std": 1.0, "grad_norm": 1.8394840139990265e-07, "kl": 0.0, "learning_rate": 1.7218771566597654e-07, "loss": 0.0, "num_tokens": 60926677.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1951 }, { "completion_length": 982.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2168.0, "completions/max_terminated_length": 2168.0, "completions/mean_length": 982.9166870117188, "completions/mean_terminated_length": 982.9166870117188, "completions/min_length": 522.0, "completions/min_terminated_length": 522.0, "epoch": 0.6621438263229308, "frac_reward_zero_std": 0.5, "grad_norm": 0.05408381670713425, "kl": 0.0, "learning_rate": 1.720151828847481e-07, "loss": -0.0012, "num_tokens": 60948612.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1952 }, { "completion_length": 1870.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5112.0, "completions/max_terminated_length": 5112.0, "completions/mean_length": 1870.75, "completions/mean_terminated_length": 1870.75, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 0.662483039348711, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.7184265010351967e-07, "loss": 0.0, "num_tokens": 60982767.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1953 }, { "completion_length": 614.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1214.0, "completions/max_terminated_length": 1214.0, "completions/mean_length": 614.25, "completions/mean_terminated_length": 614.25, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.6628222523744912, "frac_reward_zero_std": 1.0, "grad_norm": 1.6663828716900753e-07, "kl": 0.0, "learning_rate": 1.7167011732229122e-07, "loss": 0.0, "num_tokens": 61003086.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1954 }, { "completion_length": 814.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1380.0, "completions/max_terminated_length": 1380.0, "completions/mean_length": 814.8333740234375, "completions/mean_terminated_length": 814.8333740234375, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.6631614654002713, "frac_reward_zero_std": 0.5, "grad_norm": 0.09502943605184555, "kl": 0.0, "learning_rate": 1.714975845410628e-07, "loss": 0.0008, "num_tokens": 61026400.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1955 }, { "completion_length": 1044.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1956.0, "completions/max_terminated_length": 1956.0, "completions/mean_length": 1044.916748046875, "completions/mean_terminated_length": 1044.916748046875, "completions/min_length": 602.0, "completions/min_terminated_length": 602.0, "epoch": 0.6635006784260515, "frac_reward_zero_std": 1.0, "grad_norm": 1.2519059566784563e-07, "kl": 0.0, "learning_rate": 1.7132505175983435e-07, "loss": 0.0, "num_tokens": 61050411.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1956 }, { "completion_length": 837.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1387.0, "completions/max_terminated_length": 1387.0, "completions/mean_length": 837.1666870117188, "completions/mean_terminated_length": 837.1666870117188, "completions/min_length": 528.0, "completions/min_terminated_length": 528.0, "epoch": 0.6638398914518318, "frac_reward_zero_std": 0.0, "grad_norm": 0.0954817458987236, "kl": 0.0, "learning_rate": 1.711525189786059e-07, "loss": 0.0016, "num_tokens": 61071389.0, "reward": 1.1666667461395264, "reward_std": 0.09559705853462219, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1957 }, { "completion_length": 971.6667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2219.0, "completions/max_terminated_length": 2219.0, "completions/mean_length": 971.6666870117188, "completions/mean_terminated_length": 971.6666870117188, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 0.664179104477612, "frac_reward_zero_std": 1.0, "grad_norm": 9.648132959227951e-08, "kl": 0.0, "learning_rate": 1.7097998619737749e-07, "loss": 0.0, "num_tokens": 61092403.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1958 }, { "completion_length": 828.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1337.0, "completions/max_terminated_length": 1337.0, "completions/mean_length": 828.4166870117188, "completions/mean_terminated_length": 828.4166870117188, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "epoch": 0.6645183175033921, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.7080745341614904e-07, "loss": 0.0, "num_tokens": 61112928.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1959 }, { "completion_length": 2277.2500610351562, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5110.0, "completions/mean_length": 2826.33349609375, "completions/mean_terminated_length": 2484.272705078125, "completions/min_length": 638.0, "completions/min_terminated_length": 638.0, "epoch": 0.6648575305291723, "frac_reward_zero_std": 0.5, "grad_norm": 0.25705304741859436, "kl": NaN, "learning_rate": 1.7063492063492064e-07, "loss": -0.0459, "num_tokens": 61151817.0, "reward": 0.6083333492279053, "reward_std": 0.22453658282756805, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 1960 }, { "completion_length": 2505.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4011.0, "completions/max_terminated_length": 4011.0, "completions/mean_length": 2505.416748046875, "completions/mean_terminated_length": 2505.416748046875, "completions/min_length": 1419.0, "completions/min_terminated_length": 1419.0, "epoch": 0.6651967435549525, "frac_reward_zero_std": 1.0, "grad_norm": 3.2638635616422107e-07, "kl": 0.0, "learning_rate": 1.704623878536922e-07, "loss": 0.0, "num_tokens": 61197878.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1961 }, { "completion_length": 707.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1568.0, "completions/max_terminated_length": 1568.0, "completions/mean_length": 707.9166870117188, "completions/mean_terminated_length": 707.9166870117188, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.6655359565807327, "frac_reward_zero_std": 0.5, "grad_norm": 0.05857473984360695, "kl": 0.0, "learning_rate": 1.7028985507246378e-07, "loss": 0.0002, "num_tokens": 61218523.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1962 }, { "completion_length": 731.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1742.0, "completions/max_terminated_length": 1742.0, "completions/mean_length": 731.5, "completions/mean_terminated_length": 731.5, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.6658751696065129, "frac_reward_zero_std": 1.0, "grad_norm": 2.3607528021329927e-07, "kl": 0.0, "learning_rate": 1.7011732229123533e-07, "loss": 0.0, "num_tokens": 61237339.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1963 }, { "completion_length": 905.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1630.0, "completions/max_terminated_length": 1630.0, "completions/mean_length": 905.75, "completions/mean_terminated_length": 905.75, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "epoch": 0.666214382632293, "frac_reward_zero_std": 0.5, "grad_norm": 0.07896900922060013, "kl": 0.0, "learning_rate": 1.699447895100069e-07, "loss": -0.0017, "num_tokens": 61259902.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1964 }, { "completion_length": 848.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2951.0, "completions/max_terminated_length": 2951.0, "completions/mean_length": 848.1666870117188, "completions/mean_terminated_length": 848.1666870117188, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.6665535956580733, "frac_reward_zero_std": 0.5, "grad_norm": 0.18378067016601562, "kl": 0.0, "learning_rate": 1.6977225672877846e-07, "loss": -0.0017, "num_tokens": 61280466.0, "reward": 1.0333335399627686, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1965 }, { "completion_length": 1583.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5196.0, "completions/max_terminated_length": 5196.0, "completions/mean_length": 1583.8333740234375, "completions/mean_terminated_length": 1583.8333740234375, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 0.6668928086838535, "frac_reward_zero_std": 0.5, "grad_norm": 0.4428046941757202, "kl": 0.0, "learning_rate": 1.6959972394755004e-07, "loss": 0.0267, "num_tokens": 61313746.0, "reward": 0.6333333253860474, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1966 }, { "completion_length": 744.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1380.0, "completions/max_terminated_length": 1380.0, "completions/mean_length": 744.1666870117188, "completions/mean_terminated_length": 744.1666870117188, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.6672320217096337, "frac_reward_zero_std": 0.0, "grad_norm": 0.3907524645328522, "kl": 0.0, "learning_rate": 1.694271911663216e-07, "loss": -0.0044, "num_tokens": 61332318.0, "reward": 0.8666666746139526, "reward_std": 0.24494892358779907, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1967 }, { "completion_length": 2277.916748046875, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5941.0, "completions/mean_length": 3376.08349609375, "completions/mean_terminated_length": 2733.5, "completions/min_length": 830.0, "completions/min_terminated_length": 830.0, "epoch": 0.6675712347354138, "frac_reward_zero_std": 0.0, "grad_norm": 0.7467960715293884, "kl": NaN, "learning_rate": 1.6925465838509315e-07, "loss": -0.0507, "num_tokens": 61369175.0, "reward": 0.8458334803581238, "reward_std": 0.3334067463874817, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 1968 }, { "completion_length": 1644.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3718.0, "completions/max_terminated_length": 3718.0, "completions/mean_length": 1644.916748046875, "completions/mean_terminated_length": 1644.916748046875, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "epoch": 0.667910447761194, "frac_reward_zero_std": 1.0, "grad_norm": 1.7251818462682422e-07, "kl": 0.0, "learning_rate": 1.6908212560386473e-07, "loss": 0.0, "num_tokens": 61396366.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1969 }, { "completion_length": 1623.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2482.0, "completions/max_terminated_length": 2482.0, "completions/mean_length": 1623.5, "completions/mean_terminated_length": 1623.5, "completions/min_length": 1026.0, "completions/min_terminated_length": 1026.0, "epoch": 0.6682496607869742, "frac_reward_zero_std": 1.0, "grad_norm": 1.351661182980024e-07, "kl": 0.0, "learning_rate": 1.6890959282263628e-07, "loss": 0.0, "num_tokens": 61431862.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1970 }, { "completion_length": 1217.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1914.0, "completions/max_terminated_length": 1914.0, "completions/mean_length": 1217.0, "completions/mean_terminated_length": 1217.0, "completions/min_length": 681.0, "completions/min_terminated_length": 681.0, "epoch": 0.6685888738127544, "frac_reward_zero_std": 0.5, "grad_norm": 0.09548959136009216, "kl": 0.0, "learning_rate": 1.6873706004140786e-07, "loss": -0.0002, "num_tokens": 61461682.0, "reward": 0.7666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1971 }, { "completion_length": 1961.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3959.0, "completions/max_terminated_length": 3959.0, "completions/mean_length": 1961.666748046875, "completions/mean_terminated_length": 1961.666748046875, "completions/min_length": 684.0, "completions/min_terminated_length": 684.0, "epoch": 0.6689280868385346, "frac_reward_zero_std": 0.5, "grad_norm": 0.3524186909198761, "kl": 0.0, "learning_rate": 1.685645272601794e-07, "loss": 0.0027, "num_tokens": 61498638.0, "reward": 1.066666603088379, "reward_std": 0.20655910670757294, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941503047943, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1972 }, { "completion_length": 569.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 569.6666870117188, "completions/mean_terminated_length": 569.6666870117188, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.6692672998643148, "frac_reward_zero_std": 1.0, "grad_norm": 9.764529806943756e-08, "kl": 0.0, "learning_rate": 1.68391994478951e-07, "loss": 0.0, "num_tokens": 61520090.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1973 }, { "completion_length": 752.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1271.0, "completions/max_terminated_length": 1271.0, "completions/mean_length": 752.3333740234375, "completions/mean_terminated_length": 752.3333740234375, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 0.669606512890095, "frac_reward_zero_std": 1.0, "grad_norm": 1.4135873982468183e-07, "kl": 0.0, "learning_rate": 1.6821946169772254e-07, "loss": 0.0, "num_tokens": 61543008.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1974 }, { "completion_length": 853.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1526.0, "completions/max_terminated_length": 1526.0, "completions/mean_length": 853.3333740234375, "completions/mean_terminated_length": 853.3333740234375, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.6699457259158752, "frac_reward_zero_std": 0.0, "grad_norm": 0.08454319089651108, "kl": 0.0, "learning_rate": 1.6804692891649415e-07, "loss": -0.0014, "num_tokens": 61565914.0, "reward": 1.2666666507720947, "reward_std": 0.08164961636066437, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1975 }, { "completion_length": 1103.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2111.0, "completions/max_terminated_length": 2111.0, "completions/mean_length": 1103.166748046875, "completions/mean_terminated_length": 1103.166748046875, "completions/min_length": 514.0, "completions/min_terminated_length": 514.0, "epoch": 0.6702849389416553, "frac_reward_zero_std": 0.5, "grad_norm": 0.09896402806043625, "kl": 0.0, "learning_rate": 1.678743961352657e-07, "loss": 0.0009, "num_tokens": 61589238.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1976 }, { "completion_length": 2953.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 6375.0, "completions/max_terminated_length": 6375.0, "completions/mean_length": 2953.58349609375, "completions/mean_terminated_length": 2953.58349609375, "completions/min_length": 533.0, "completions/min_terminated_length": 533.0, "epoch": 0.6706241519674355, "frac_reward_zero_std": 0.5, "grad_norm": 0.12272366881370544, "kl": 0.0, "learning_rate": 1.6770186335403728e-07, "loss": -0.0028, "num_tokens": 61635391.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1977 }, { "completion_length": 1599.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3130.0, "completions/max_terminated_length": 3130.0, "completions/mean_length": 1599.8333740234375, "completions/mean_terminated_length": 1599.8333740234375, "completions/min_length": 686.0, "completions/min_terminated_length": 686.0, "epoch": 0.6709633649932157, "frac_reward_zero_std": 1.0, "grad_norm": 2.0407178169534745e-07, "kl": 0.0, "learning_rate": 1.6752933057280883e-07, "loss": 0.0, "num_tokens": 61664387.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1978 }, { "completion_length": 1722.3334350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 3803.0, "completions/max_terminated_length": 3803.0, "completions/mean_length": 1722.3333740234375, "completions/mean_terminated_length": 1722.3333740234375, "completions/min_length": 570.0, "completions/min_terminated_length": 570.0, "epoch": 0.671302578018996, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.673567977915804e-07, "loss": 0.0, "num_tokens": 61700955.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1979 }, { "completion_length": 498.1666717529297, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 498.16668701171875, "completions/mean_terminated_length": 498.16668701171875, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.6716417910447762, "frac_reward_zero_std": 1.0, "grad_norm": 1.1437555968996094e-07, "kl": 0.0, "learning_rate": 1.6718426501035197e-07, "loss": 0.0, "num_tokens": 61717493.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1980 }, { "completion_length": 1846.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3846.0, "completions/max_terminated_length": 3846.0, "completions/mean_length": 1846.916748046875, "completions/mean_terminated_length": 1846.916748046875, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.6719810040705563, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.6701173222912352e-07, "loss": 0.0, "num_tokens": 61749754.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1981 }, { "completion_length": 2428.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3687.0, "completions/max_terminated_length": 3687.0, "completions/mean_length": 2428.166748046875, "completions/mean_terminated_length": 2428.166748046875, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 0.6723202170963365, "frac_reward_zero_std": 0.5, "grad_norm": 0.5537146329879761, "kl": 0.0, "learning_rate": 1.668391994478951e-07, "loss": 0.0228, "num_tokens": 61794384.0, "reward": 0.7166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1982 }, { "completion_length": 445.75001525878906, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 445.75, "completions/mean_terminated_length": 445.75, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.6726594301221167, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.6666666666666665e-07, "loss": 0.0, "num_tokens": 61816929.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1983 }, { "completion_length": 479.6666717529297, "completions/clipped_ratio": 0.0, "completions/max_length": 802.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 479.66668701171875, "completions/mean_terminated_length": 479.66668701171875, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.6729986431478969, "frac_reward_zero_std": 1.0, "grad_norm": 2.4012939547901624e-07, "kl": 0.0, "learning_rate": 1.6649413388543823e-07, "loss": 0.0, "num_tokens": 61832729.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1984 }, { "completion_length": 546.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 938.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 546.5, "completions/mean_terminated_length": 546.5, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.673337856173677, "frac_reward_zero_std": 1.0, "grad_norm": 1.215143754507153e-07, "kl": 0.0, "learning_rate": 1.6632160110420978e-07, "loss": 0.0, "num_tokens": 61854971.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1985 }, { "completion_length": 619.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 619.4166870117188, "completions/mean_terminated_length": 619.4166870117188, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.6736770691994572, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.6614906832298136e-07, "loss": 0.0, "num_tokens": 61873636.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1986 }, { "completion_length": 1070.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1766.0, "completions/max_terminated_length": 1766.0, "completions/mean_length": 1070.5, "completions/mean_terminated_length": 1070.5, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.6740162822252375, "frac_reward_zero_std": 1.0, "grad_norm": 1.2579306485349662e-07, "kl": 0.0, "learning_rate": 1.6597653554175291e-07, "loss": 0.0, "num_tokens": 61898800.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1987 }, { "completion_length": 2168.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4877.0, "completions/max_terminated_length": 4877.0, "completions/mean_length": 2168.166748046875, "completions/mean_terminated_length": 2168.166748046875, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 0.6743554952510177, "frac_reward_zero_std": 1.0, "grad_norm": 2.26411344783628e-07, "kl": 0.0, "learning_rate": 1.658040027605245e-07, "loss": 0.0, "num_tokens": 61936806.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1988 }, { "completion_length": 1114.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2085.0, "completions/max_terminated_length": 2085.0, "completions/mean_length": 1114.416748046875, "completions/mean_terminated_length": 1114.416748046875, "completions/min_length": 566.0, "completions/min_terminated_length": 566.0, "epoch": 0.6746947082767978, "frac_reward_zero_std": 0.0, "grad_norm": 0.48932066559791565, "kl": 0.0, "learning_rate": 1.6563146997929605e-07, "loss": 0.0077, "num_tokens": 61958969.0, "reward": 1.0499999523162842, "reward_std": 0.37606820464134216, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.36306774616241455, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1989 }, { "completion_length": 1896.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4125.0, "completions/max_terminated_length": 4125.0, "completions/mean_length": 1896.8333740234375, "completions/mean_terminated_length": 1896.8333740234375, "completions/min_length": 617.0, "completions/min_terminated_length": 617.0, "epoch": 0.675033921302578, "frac_reward_zero_std": 0.5, "grad_norm": 0.09354446828365326, "kl": 0.0, "learning_rate": 1.6545893719806763e-07, "loss": 0.0004, "num_tokens": 61988565.0, "reward": 0.7666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1990 }, { "completion_length": 2606.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5105.0, "completions/max_terminated_length": 5105.0, "completions/mean_length": 2606.666748046875, "completions/mean_terminated_length": 2606.666748046875, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "epoch": 0.6753731343283582, "frac_reward_zero_std": 0.0, "grad_norm": 0.17809031903743744, "kl": 0.0, "learning_rate": 1.652864044168392e-07, "loss": -0.0022, "num_tokens": 62029619.0, "reward": 1.2166666984558105, "reward_std": 0.10641198605298996, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1991 }, { "completion_length": 594.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 926.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 594.75, "completions/mean_terminated_length": 594.75, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.6757123473541384, "frac_reward_zero_std": 1.0, "grad_norm": 1.0304655262416418e-07, "kl": 0.0, "learning_rate": 1.6511387163561076e-07, "loss": 0.0, "num_tokens": 62049566.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1992 }, { "completion_length": 773.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1525.0, "completions/max_terminated_length": 1525.0, "completions/mean_length": 773.25, "completions/mean_terminated_length": 773.25, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.6760515603799185, "frac_reward_zero_std": 0.0, "grad_norm": 0.09425163269042969, "kl": 0.0, "learning_rate": 1.6494133885438234e-07, "loss": 0.0017, "num_tokens": 62072849.0, "reward": 1.1666667461395264, "reward_std": 0.0955970510840416, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1993 }, { "completion_length": 598.75, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 598.75, "completions/mean_terminated_length": 598.75, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 0.6763907734056988, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.647688060731539e-07, "loss": 0.0, "num_tokens": 62091104.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1994 }, { "completion_length": 1697.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3587.0, "completions/max_terminated_length": 3587.0, "completions/mean_length": 1697.5, "completions/mean_terminated_length": 1697.5, "completions/min_length": 707.0, "completions/min_terminated_length": 707.0, "epoch": 0.676729986431479, "frac_reward_zero_std": 0.5, "grad_norm": 0.1103680431842804, "kl": 0.0, "learning_rate": 1.6459627329192547e-07, "loss": -0.0005, "num_tokens": 62122946.0, "reward": 0.2875000238418579, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 1995 }, { "completion_length": 761.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1100.0, "completions/max_terminated_length": 1100.0, "completions/mean_length": 761.4166870117188, "completions/mean_terminated_length": 761.4166870117188, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.6770691994572592, "frac_reward_zero_std": 0.5, "grad_norm": 0.0881757140159607, "kl": 0.0, "learning_rate": 1.6442374051069702e-07, "loss": -0.0, "num_tokens": 62142415.0, "reward": 1.2333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1996 }, { "completion_length": 824.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1292.0, "completions/max_terminated_length": 1292.0, "completions/mean_length": 824.25, "completions/mean_terminated_length": 824.25, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "epoch": 0.6774084124830394, "frac_reward_zero_std": 0.5, "grad_norm": 0.30276182293891907, "kl": 0.0, "learning_rate": 1.642512077294686e-07, "loss": 0.0011, "num_tokens": 62167162.0, "reward": 1.133333444595337, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.28069180250167847, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1997 }, { "completion_length": 1338.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5198.0, "completions/max_terminated_length": 5198.0, "completions/mean_length": 1338.25, "completions/mean_terminated_length": 1338.25, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "epoch": 0.6777476255088195, "frac_reward_zero_std": 0.5, "grad_norm": 0.10310881584882736, "kl": 0.0, "learning_rate": 1.6407867494824015e-07, "loss": -0.0013, "num_tokens": 62197567.0, "reward": 0.7666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1998 }, { "completion_length": 704.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1211.0, "completions/max_terminated_length": 1211.0, "completions/mean_length": 704.4166870117188, "completions/mean_terminated_length": 704.4166870117188, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 0.6780868385345997, "frac_reward_zero_std": 0.5, "grad_norm": 0.09383802115917206, "kl": 0.0, "learning_rate": 1.6390614216701173e-07, "loss": -0.0005, "num_tokens": 62217588.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 1999 }, { "completion_length": 1661.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2886.0, "completions/max_terminated_length": 2886.0, "completions/mean_length": 1661.75, "completions/mean_terminated_length": 1661.75, "completions/min_length": 735.0, "completions/min_terminated_length": 735.0, "epoch": 0.6784260515603799, "frac_reward_zero_std": 0.0, "grad_norm": 0.52850341796875, "kl": 0.0, "learning_rate": 1.6373360938578329e-07, "loss": 0.0041, "num_tokens": 62252535.0, "reward": 1.183333396911621, "reward_std": 0.2557638883590698, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2000 }, { "completion_length": 619.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1103.0, "completions/max_terminated_length": 1103.0, "completions/mean_length": 619.8333740234375, "completions/mean_terminated_length": 619.8333740234375, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.6787652645861602, "frac_reward_zero_std": 1.0, "grad_norm": 1.448915014634622e-07, "kl": 0.0, "learning_rate": 1.6356107660455487e-07, "loss": 0.0, "num_tokens": 62272393.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2001 }, { "completion_length": 693.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 693.0833740234375, "completions/mean_terminated_length": 693.0833740234375, "completions/min_length": 576.0, "completions/min_terminated_length": 576.0, "epoch": 0.6791044776119403, "frac_reward_zero_std": 1.0, "grad_norm": 1.78244093262947e-07, "kl": 0.0, "learning_rate": 1.6338854382332642e-07, "loss": 0.0, "num_tokens": 62293676.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2002 }, { "completion_length": 750.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1085.0, "completions/max_terminated_length": 1085.0, "completions/mean_length": 750.4166870117188, "completions/mean_terminated_length": 750.4166870117188, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 0.6794436906377205, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.6321601104209797e-07, "loss": 0.0, "num_tokens": 62314147.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2003 }, { "completion_length": 972.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1518.0, "completions/max_terminated_length": 1518.0, "completions/mean_length": 972.4166870117188, "completions/mean_terminated_length": 972.4166870117188, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 0.6797829036635007, "frac_reward_zero_std": 0.0, "grad_norm": 0.36703795194625854, "kl": 0.0, "learning_rate": 1.6304347826086955e-07, "loss": 0.0014, "num_tokens": 62338206.0, "reward": 1.183333396911621, "reward_std": 0.2557638883590698, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2004 }, { "completion_length": 1320.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2049.0, "completions/max_terminated_length": 2049.0, "completions/mean_length": 1320.0833740234375, "completions/mean_terminated_length": 1320.0833740234375, "completions/min_length": 710.0, "completions/min_terminated_length": 710.0, "epoch": 0.6801221166892809, "frac_reward_zero_std": 1.0, "grad_norm": 1.9519764293818298e-07, "kl": 0.0, "learning_rate": 1.628709454796411e-07, "loss": 0.0, "num_tokens": 62369719.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2005 }, { "completion_length": 686.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1166.0, "completions/max_terminated_length": 1166.0, "completions/mean_length": 686.75, "completions/mean_terminated_length": 686.75, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 0.680461329715061, "frac_reward_zero_std": 0.5, "grad_norm": 0.06025727838277817, "kl": 0.0, "learning_rate": 1.6269841269841268e-07, "loss": -0.0004, "num_tokens": 62392048.0, "reward": 0.6875, "reward_std": 0.030618607997894287, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 2006 }, { "completion_length": 1253.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2622.0, "completions/max_terminated_length": 2622.0, "completions/mean_length": 1253.75, "completions/mean_terminated_length": 1253.75, "completions/min_length": 524.0, "completions/min_terminated_length": 524.0, "epoch": 0.6808005427408412, "frac_reward_zero_std": 0.5, "grad_norm": 0.060199495404958725, "kl": 0.0, "learning_rate": 1.6252587991718426e-07, "loss": -0.0001, "num_tokens": 62419951.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2007 }, { "completion_length": 1129.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 1519.0, "completions/max_terminated_length": 1519.0, "completions/mean_length": 1129.666748046875, "completions/mean_terminated_length": 1129.666748046875, "completions/min_length": 557.0, "completions/min_terminated_length": 557.0, "epoch": 0.6811397557666214, "frac_reward_zero_std": 0.5, "grad_norm": 0.08440901339054108, "kl": 0.0, "learning_rate": 1.6235334713595584e-07, "loss": 0.0004, "num_tokens": 62444133.0, "reward": 0.7666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2008 }, { "completion_length": 805.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1057.0, "completions/max_terminated_length": 1057.0, "completions/mean_length": 805.5, "completions/mean_terminated_length": 805.5, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "epoch": 0.6814789687924017, "frac_reward_zero_std": 1.0, "grad_norm": 1.9984629773261986e-07, "kl": 0.0, "learning_rate": 1.621808143547274e-07, "loss": 0.0, "num_tokens": 62465487.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2009 }, { "completion_length": 1119.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2105.0, "completions/max_terminated_length": 2105.0, "completions/mean_length": 1119.0, "completions/mean_terminated_length": 1119.0, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.6818181818181818, "frac_reward_zero_std": 0.5, "grad_norm": 0.08090119808912277, "kl": 0.0, "learning_rate": 1.6200828157349897e-07, "loss": -0.0007, "num_tokens": 62491899.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2010 }, { "completion_length": 863.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1182.0, "completions/max_terminated_length": 1182.0, "completions/mean_length": 863.8333740234375, "completions/mean_terminated_length": 863.8333740234375, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 0.682157394843962, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.6183574879227053e-07, "loss": 0.0, "num_tokens": 62517259.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2011 }, { "completion_length": 547.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 547.4166870117188, "completions/mean_terminated_length": 547.4166870117188, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.6824966078697422, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.616632160110421e-07, "loss": 0.0, "num_tokens": 62532390.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2012 }, { "completion_length": 841.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1460.0, "completions/max_terminated_length": 1460.0, "completions/mean_length": 841.0833740234375, "completions/mean_terminated_length": 841.0833740234375, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.6828358208955224, "frac_reward_zero_std": 0.5, "grad_norm": 0.10449093580245972, "kl": 0.0, "learning_rate": 1.6149068322981366e-07, "loss": -0.0024, "num_tokens": 62551447.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2013 }, { "completion_length": 839.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1197.0, "completions/max_terminated_length": 1197.0, "completions/mean_length": 839.0, "completions/mean_terminated_length": 839.0, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "epoch": 0.6831750339213026, "frac_reward_zero_std": 1.0, "grad_norm": 9.894851871194987e-08, "kl": 0.0, "learning_rate": 1.613181504485852e-07, "loss": 0.0, "num_tokens": 62566213.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2014 }, { "completion_length": 1352.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2730.0, "completions/max_terminated_length": 2730.0, "completions/mean_length": 1352.166748046875, "completions/mean_terminated_length": 1352.166748046875, "completions/min_length": 860.0, "completions/min_terminated_length": 860.0, "epoch": 0.6835142469470827, "frac_reward_zero_std": 0.5, "grad_norm": 0.4879356324672699, "kl": 0.0, "learning_rate": 1.611456176673568e-07, "loss": 0.0115, "num_tokens": 62593401.0, "reward": 1.133333444595337, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.28069180250167847, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2015 }, { "completion_length": 774.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1275.0, "completions/max_terminated_length": 1275.0, "completions/mean_length": 774.25, "completions/mean_terminated_length": 774.25, "completions/min_length": 567.0, "completions/min_terminated_length": 567.0, "epoch": 0.683853459972863, "frac_reward_zero_std": 0.5, "grad_norm": 0.0828675702214241, "kl": 0.0, "learning_rate": 1.6097308488612834e-07, "loss": 0.0001, "num_tokens": 62614254.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2016 }, { "completion_length": 1744.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4210.0, "completions/max_terminated_length": 4210.0, "completions/mean_length": 1744.166748046875, "completions/mean_terminated_length": 1744.166748046875, "completions/min_length": 541.0, "completions/min_terminated_length": 541.0, "epoch": 0.6841926729986432, "frac_reward_zero_std": 0.0, "grad_norm": 0.6455689668655396, "kl": 0.0, "learning_rate": 1.6080055210489992e-07, "loss": -0.0208, "num_tokens": 62647682.0, "reward": 0.9166666865348816, "reward_std": 0.24738392233848572, "rewards/correctness_reward_func/mean": 0.6166666150093079, "rewards/correctness_reward_func/std": 0.4628632962703705, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2017 }, { "completion_length": 1005.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2194.0, "completions/max_terminated_length": 2194.0, "completions/mean_length": 1005.5833740234375, "completions/mean_terminated_length": 1005.5833740234375, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.6845318860244234, "frac_reward_zero_std": 0.0, "grad_norm": 0.38874953985214233, "kl": 0.0, "learning_rate": 1.6062801932367148e-07, "loss": -0.0082, "num_tokens": 62674023.0, "reward": 1.133333444595337, "reward_std": 0.24494896829128265, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.28069180250167847, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2018 }, { "completion_length": 1409.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2283.0, "completions/max_terminated_length": 2283.0, "completions/mean_length": 1409.5833740234375, "completions/mean_terminated_length": 1409.5833740234375, "completions/min_length": 864.0, "completions/min_terminated_length": 864.0, "epoch": 0.6848710990502035, "frac_reward_zero_std": 0.5, "grad_norm": 0.4037747383117676, "kl": 0.0, "learning_rate": 1.6045548654244306e-07, "loss": -0.0023, "num_tokens": 62699620.0, "reward": 0.38333332538604736, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.0833333358168602, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2019 }, { "completion_length": 1002.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1746.0, "completions/max_terminated_length": 1746.0, "completions/mean_length": 1002.3333740234375, "completions/mean_terminated_length": 1002.3333740234375, "completions/min_length": 626.0, "completions/min_terminated_length": 626.0, "epoch": 0.6852103120759837, "frac_reward_zero_std": 0.5, "grad_norm": 0.3185257315635681, "kl": 0.0, "learning_rate": 1.602829537612146e-07, "loss": -0.0003, "num_tokens": 62721938.0, "reward": 1.0, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.43064433336257935, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2020 }, { "completion_length": 902.8333435058594, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 2631.0, "completions/mean_length": 2001.0, "completions/mean_terminated_length": 1083.4000244140625, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.6855495251017639, "frac_reward_zero_std": 0.5, "grad_norm": 0.6411004066467285, "kl": NaN, "learning_rate": 1.601104209799862e-07, "loss": -0.0262, "num_tokens": 62743464.0, "reward": 0.9958333373069763, "reward_std": 0.30266183614730835, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.44585633277893066, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 2021 }, { "completion_length": 779.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1542.0, "completions/max_terminated_length": 1542.0, "completions/mean_length": 779.5833740234375, "completions/mean_terminated_length": 779.5833740234375, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 0.6858887381275441, "frac_reward_zero_std": 1.0, "grad_norm": 1.376884881665319e-07, "kl": 0.0, "learning_rate": 1.5993788819875774e-07, "loss": 0.0, "num_tokens": 62764525.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2022 }, { "completion_length": 1963.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5597.0, "completions/max_terminated_length": 5597.0, "completions/mean_length": 1963.666748046875, "completions/mean_terminated_length": 1963.666748046875, "completions/min_length": 897.0, "completions/min_terminated_length": 897.0, "epoch": 0.6862279511533242, "frac_reward_zero_std": 0.5, "grad_norm": 0.799390971660614, "kl": 0.0, "learning_rate": 1.5976535541752935e-07, "loss": 0.0397, "num_tokens": 62799369.0, "reward": 1.1166667938232422, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2023 }, { "completion_length": 1884.0834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 5482.0, "completions/max_terminated_length": 5482.0, "completions/mean_length": 1884.0833740234375, "completions/mean_terminated_length": 1884.0833740234375, "completions/min_length": 530.0, "completions/min_terminated_length": 530.0, "epoch": 0.6865671641791045, "frac_reward_zero_std": 0.0, "grad_norm": 0.47000551223754883, "kl": 0.0, "learning_rate": 1.595928226363009e-07, "loss": 0.0066, "num_tokens": 62835610.0, "reward": 1.0833333730697632, "reward_std": 0.21807155013084412, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2024 }, { "completion_length": 1503.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3092.0, "completions/max_terminated_length": 3092.0, "completions/mean_length": 1503.3333740234375, "completions/mean_terminated_length": 1503.3333740234375, "completions/min_length": 614.0, "completions/min_terminated_length": 614.0, "epoch": 0.6869063772048847, "frac_reward_zero_std": 0.0, "grad_norm": 0.11690549552440643, "kl": 0.0, "learning_rate": 1.5942028985507245e-07, "loss": 0.0007, "num_tokens": 62866118.0, "reward": 1.2000000476837158, "reward_std": 0.08164963126182556, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2025 }, { "completion_length": 1004.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2635.0, "completions/max_terminated_length": 2635.0, "completions/mean_length": 1004.4166870117188, "completions/mean_terminated_length": 1004.4166870117188, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.6872455902306649, "frac_reward_zero_std": 1.0, "grad_norm": 2.477042357895698e-07, "kl": 0.0, "learning_rate": 1.5924775707384403e-07, "loss": 0.0, "num_tokens": 62890231.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2026 }, { "completion_length": 506.25001525878906, "completions/clipped_ratio": 0.0, "completions/max_length": 1305.0, "completions/max_terminated_length": 1305.0, "completions/mean_length": 506.25, "completions/mean_terminated_length": 506.25, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.687584803256445, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.5907522429261558e-07, "loss": 0.0, "num_tokens": 62910052.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2027 }, { "completion_length": 1173.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2191.0, "completions/max_terminated_length": 2191.0, "completions/mean_length": 1173.8333740234375, "completions/mean_terminated_length": 1173.8333740234375, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 0.6879240162822252, "frac_reward_zero_std": 1.0, "grad_norm": 1.278587689057531e-07, "kl": 0.0, "learning_rate": 1.5890269151138716e-07, "loss": 0.0, "num_tokens": 62937290.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2028 }, { "completion_length": 2332.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 5934.0, "completions/max_terminated_length": 5934.0, "completions/mean_length": 2332.166748046875, "completions/mean_terminated_length": 2332.166748046875, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 0.6882632293080054, "frac_reward_zero_std": 0.5, "grad_norm": 0.3408600986003876, "kl": 0.0, "learning_rate": 1.5873015873015872e-07, "loss": 0.0274, "num_tokens": 62981482.0, "reward": 0.8666666746139526, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2029 }, { "completion_length": 1085.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 1600.0, "completions/max_terminated_length": 1600.0, "completions/mean_length": 1085.416748046875, "completions/mean_terminated_length": 1085.416748046875, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "epoch": 0.6886024423337856, "frac_reward_zero_std": 1.0, "grad_norm": 2.5619479515626153e-07, "kl": 0.0, "learning_rate": 1.585576259489303e-07, "loss": 0.0, "num_tokens": 63006567.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2030 }, { "completion_length": 1176.6667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 1767.0, "completions/max_terminated_length": 1767.0, "completions/mean_length": 1176.666748046875, "completions/mean_terminated_length": 1176.666748046875, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 0.6889416553595658, "frac_reward_zero_std": 0.5, "grad_norm": 0.07176148891448975, "kl": 0.0, "learning_rate": 1.5838509316770185e-07, "loss": -0.0007, "num_tokens": 63033269.0, "reward": 0.7749999761581421, "reward_std": 0.03872983902692795, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 2031 }, { "completion_length": 827.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1624.0, "completions/max_terminated_length": 1624.0, "completions/mean_length": 827.5, "completions/mean_terminated_length": 827.5, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 0.689280868385346, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.5821256038647343e-07, "loss": 0.0, "num_tokens": 63052181.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2032 }, { "completion_length": 1228.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 4204.0, "completions/max_terminated_length": 4204.0, "completions/mean_length": 1228.5, "completions/mean_terminated_length": 1228.5, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.6896200814111262, "frac_reward_zero_std": 0.5, "grad_norm": 0.06624264270067215, "kl": 0.0, "learning_rate": 1.5804002760524498e-07, "loss": 0.0008, "num_tokens": 63074153.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2033 }, { "completion_length": 865.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1190.0, "completions/max_terminated_length": 1190.0, "completions/mean_length": 865.9166870117188, "completions/mean_terminated_length": 865.9166870117188, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 0.6899592944369064, "frac_reward_zero_std": 1.0, "grad_norm": 1.8322708683626843e-07, "kl": 0.0, "learning_rate": 1.5786749482401656e-07, "loss": 0.0, "num_tokens": 63094420.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2034 }, { "completion_length": 1244.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3118.0, "completions/max_terminated_length": 3118.0, "completions/mean_length": 1244.0, "completions/mean_terminated_length": 1244.0, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 0.6902985074626866, "frac_reward_zero_std": 0.5, "grad_norm": 0.11194341629743576, "kl": 0.0, "learning_rate": 1.576949620427881e-07, "loss": 0.0024, "num_tokens": 63125422.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2035 }, { "completion_length": 1643.8333740234375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6292.0, "completions/mean_length": 2742.0, "completions/mean_terminated_length": 1972.5999755859375, "completions/min_length": 589.0, "completions/min_terminated_length": 589.0, "epoch": 0.6906377204884667, "frac_reward_zero_std": 0.5, "grad_norm": 0.25398892164230347, "kl": NaN, "learning_rate": 1.575224292615597e-07, "loss": -0.0186, "num_tokens": 63154880.0, "reward": 0.75, "reward_std": 0.0774596780538559, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 2036 }, { "completion_length": 891.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1355.0, "completions/max_terminated_length": 1355.0, "completions/mean_length": 891.8333740234375, "completions/mean_terminated_length": 891.8333740234375, "completions/min_length": 635.0, "completions/min_terminated_length": 635.0, "epoch": 0.6909769335142469, "frac_reward_zero_std": 0.0, "grad_norm": 0.5131769776344299, "kl": 0.0, "learning_rate": 1.5734989648033124e-07, "loss": -0.002, "num_tokens": 63174696.0, "reward": 1.183333396911621, "reward_std": 0.24082478880882263, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2037 }, { "completion_length": 1098.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1488.0, "completions/max_terminated_length": 1488.0, "completions/mean_length": 1098.166748046875, "completions/mean_terminated_length": 1098.166748046875, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 0.6913161465400272, "frac_reward_zero_std": 0.5, "grad_norm": 0.10775512456893921, "kl": 0.0, "learning_rate": 1.5717736369910282e-07, "loss": 0.0009, "num_tokens": 63203270.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2038 }, { "completion_length": 973.4167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2317.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 973.4166870117188, "completions/mean_terminated_length": 973.4166870117188, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 0.6916553595658074, "frac_reward_zero_std": 1.0, "grad_norm": 2.646088148594572e-07, "kl": 0.0, "learning_rate": 1.570048309178744e-07, "loss": 0.0, "num_tokens": 63224317.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2039 }, { "completion_length": 1180.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1953.0, "completions/max_terminated_length": 1953.0, "completions/mean_length": 1180.75, "completions/mean_terminated_length": 1180.75, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 0.6919945725915875, "frac_reward_zero_std": 1.0, "grad_norm": 9.28447576598046e-08, "kl": 0.0, "learning_rate": 1.5683229813664596e-07, "loss": 0.0, "num_tokens": 63246886.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2040 }, { "completion_length": 747.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1243.0, "completions/max_terminated_length": 1243.0, "completions/mean_length": 747.25, "completions/mean_terminated_length": 747.25, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 0.6923337856173677, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.5665976535541754e-07, "loss": 0.0, "num_tokens": 63264673.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2041 }, { "completion_length": 803.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1633.0, "completions/max_terminated_length": 1633.0, "completions/mean_length": 803.25, "completions/mean_terminated_length": 803.25, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "epoch": 0.6926729986431479, "frac_reward_zero_std": 0.5, "grad_norm": 0.3430081605911255, "kl": 0.0, "learning_rate": 1.564872325741891e-07, "loss": -0.0002, "num_tokens": 63280414.0, "reward": 0.8833333849906921, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.5149286389350891, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2042 }, { "completion_length": 826.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1323.0, "completions/max_terminated_length": 1323.0, "completions/mean_length": 826.8333740234375, "completions/mean_terminated_length": 826.8333740234375, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 0.6930122116689281, "frac_reward_zero_std": 0.5, "grad_norm": 0.07445285469293594, "kl": 0.0, "learning_rate": 1.5631469979296067e-07, "loss": 0.0, "num_tokens": 63303104.0, "reward": 0.7666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2043 }, { "completion_length": 633.3333740234375, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4448.0, "completions/mean_length": 3378.75, "completions/mean_terminated_length": 1085.71435546875, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.6933514246947082, "frac_reward_zero_std": 0.0, "grad_norm": 0.23700635135173798, "kl": NaN, "learning_rate": 1.5614216701173222e-07, "loss": -0.0145, "num_tokens": 63326232.0, "reward": 0.6583333015441895, "reward_std": 0.10206204652786255, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.17499999701976776, "rewards/format_reward_func/std": 0.15447859466075897, "step": 2044 }, { "completion_length": 2833.8334350585938, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5314.0, "completions/mean_length": 3932.0, "completions/mean_terminated_length": 3400.60009765625, "completions/min_length": 2185.0, "completions/min_terminated_length": 2185.0, "epoch": 0.6936906377204884, "frac_reward_zero_std": 0.5, "grad_norm": 0.8445065021514893, "kl": NaN, "learning_rate": 1.559696342305038e-07, "loss": -0.0763, "num_tokens": 63372946.0, "reward": 0.5291666984558105, "reward_std": 0.26571446657180786, "rewards/correctness_reward_func/mean": 0.2666666805744171, "rewards/correctness_reward_func/std": 0.393892765045166, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 2045 }, { "completion_length": 1124.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2259.0, "completions/max_terminated_length": 2259.0, "completions/mean_length": 1124.666748046875, "completions/mean_terminated_length": 1124.666748046875, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 0.6940298507462687, "frac_reward_zero_std": 0.5, "grad_norm": 0.04320630058646202, "kl": 0.0, "learning_rate": 1.5579710144927535e-07, "loss": 0.0007, "num_tokens": 63400512.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2046 }, { "completion_length": 1328.75, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5579.0, "completions/mean_length": 2976.0, "completions/mean_terminated_length": 1771.6666259765625, "completions/min_length": 762.0, "completions/min_terminated_length": 762.0, "epoch": 0.6943690637720489, "frac_reward_zero_std": 0.0, "grad_norm": 0.25639498233795166, "kl": NaN, "learning_rate": 1.5562456866804693e-07, "loss": -0.0181, "num_tokens": 63430509.0, "reward": 0.6750000715255737, "reward_std": 0.13693061470985413, "rewards/correctness_reward_func/mean": 0.45000001788139343, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.2250000238418579, "rewards/format_reward_func/std": 0.13568010926246643, "step": 2047 }, { "completion_length": 507.66668701171875, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 507.66668701171875, "completions/mean_terminated_length": 507.66668701171875, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 0.694708276797829, "frac_reward_zero_std": 0.5, "grad_norm": 0.06822038441896439, "kl": 0.0, "learning_rate": 1.5545203588681848e-07, "loss": -0.0003, "num_tokens": 63445565.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2048 }, { "completion_length": 1740.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4154.0, "completions/max_terminated_length": 4154.0, "completions/mean_length": 1740.0, "completions/mean_terminated_length": 1740.0, "completions/min_length": 761.0, "completions/min_terminated_length": 761.0, "epoch": 0.6950474898236092, "frac_reward_zero_std": 0.5, "grad_norm": 0.09029985219240189, "kl": 0.0, "learning_rate": 1.5527950310559004e-07, "loss": -0.0001, "num_tokens": 63478523.0, "reward": 0.7666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2049 }, { "completion_length": 1367.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3095.0, "completions/max_terminated_length": 3095.0, "completions/mean_length": 1367.75, "completions/mean_terminated_length": 1367.75, "completions/min_length": 553.0, "completions/min_terminated_length": 553.0, "epoch": 0.6953867028493894, "frac_reward_zero_std": 0.5, "grad_norm": 0.06197202950716019, "kl": 0.0, "learning_rate": 1.5510697032436162e-07, "loss": -0.0001, "num_tokens": 63509612.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2050 }, { "completion_length": 1539.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2806.0, "completions/max_terminated_length": 2806.0, "completions/mean_length": 1539.5833740234375, "completions/mean_terminated_length": 1539.5833740234375, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 0.6957259158751696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.5493443754313317e-07, "loss": 0.0, "num_tokens": 63538851.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2051 }, { "completion_length": 933.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2015.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 933.3333740234375, "completions/mean_terminated_length": 933.3333740234375, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.6960651289009498, "frac_reward_zero_std": 0.5, "grad_norm": 0.49535584449768066, "kl": 0.0, "learning_rate": 1.5476190476190475e-07, "loss": -0.0138, "num_tokens": 63560269.0, "reward": 1.0, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.43064433336257935, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2052 }, { "completion_length": 528.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 528.4166870117188, "completions/mean_terminated_length": 528.4166870117188, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.69640434192673, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.545893719806763e-07, "loss": 0.0, "num_tokens": 63581076.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2053 }, { "completion_length": 619.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1285.0, "completions/max_terminated_length": 1285.0, "completions/mean_length": 619.3333740234375, "completions/mean_terminated_length": 619.3333740234375, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.6967435549525102, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.544168391994479e-07, "loss": 0.0, "num_tokens": 63599218.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2054 }, { "completion_length": 1254.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2496.0, "completions/max_terminated_length": 2496.0, "completions/mean_length": 1254.916748046875, "completions/mean_terminated_length": 1254.916748046875, "completions/min_length": 806.0, "completions/min_terminated_length": 806.0, "epoch": 0.6970827679782904, "frac_reward_zero_std": 0.0, "grad_norm": 0.33636075258255005, "kl": 0.0, "learning_rate": 1.5424430641821946e-07, "loss": 0.0026, "num_tokens": 63625581.0, "reward": 0.8333333730697632, "reward_std": 0.21493908762931824, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.47736650705337524, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2055 }, { "completion_length": 2051.0833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6300.0, "completions/mean_length": 2600.166748046875, "completions/mean_terminated_length": 2237.54541015625, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 0.6974219810040706, "frac_reward_zero_std": 0.5, "grad_norm": 0.6427962779998779, "kl": NaN, "learning_rate": 1.5407177363699104e-07, "loss": -0.0563, "num_tokens": 63662392.0, "reward": 1.0416667461395264, "reward_std": 0.2457980364561081, "rewards/correctness_reward_func/mean": 0.7666667103767395, "rewards/correctness_reward_func/std": 0.25346091389656067, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 2056 }, { "completion_length": 1267.4167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 3008.0, "completions/max_terminated_length": 3008.0, "completions/mean_length": 1267.416748046875, "completions/mean_terminated_length": 1267.416748046875, "completions/min_length": 614.0, "completions/min_terminated_length": 614.0, "epoch": 0.6977611940298507, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.538992408557626e-07, "loss": 0.0, "num_tokens": 63687837.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2057 }, { "completion_length": 673.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1506.0, "completions/max_terminated_length": 1506.0, "completions/mean_length": 673.6666870117188, "completions/mean_terminated_length": 673.6666870117188, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.6981004070556309, "frac_reward_zero_std": 0.5, "grad_norm": 0.05980612337589264, "kl": 0.0, "learning_rate": 1.5372670807453417e-07, "loss": 0.0002, "num_tokens": 63705635.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2058 }, { "completion_length": 1112.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2108.0, "completions/max_terminated_length": 2108.0, "completions/mean_length": 1112.25, "completions/mean_terminated_length": 1112.25, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "epoch": 0.6984396200814111, "frac_reward_zero_std": 0.5, "grad_norm": 0.10524420440196991, "kl": 0.0, "learning_rate": 1.5355417529330572e-07, "loss": 0.0021, "num_tokens": 63732842.0, "reward": 0.7333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2059 }, { "completion_length": 1384.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2702.0, "completions/max_terminated_length": 2702.0, "completions/mean_length": 1384.916748046875, "completions/mean_terminated_length": 1384.916748046875, "completions/min_length": 832.0, "completions/min_terminated_length": 832.0, "epoch": 0.6987788331071914, "frac_reward_zero_std": 1.0, "grad_norm": 2.0923850740928174e-07, "kl": 0.0, "learning_rate": 1.5338164251207728e-07, "loss": 0.0, "num_tokens": 63760207.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2060 }, { "completion_length": 2088.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 6389.0, "completions/max_terminated_length": 6389.0, "completions/mean_length": 2088.916748046875, "completions/mean_terminated_length": 2088.916748046875, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 0.6991180461329715, "frac_reward_zero_std": 0.0, "grad_norm": 0.8314260244369507, "kl": 0.0, "learning_rate": 1.5320910973084886e-07, "loss": -0.0252, "num_tokens": 63798048.0, "reward": 0.9166666865348816, "reward_std": 0.4232131838798523, "rewards/correctness_reward_func/mean": 0.6166666150093079, "rewards/correctness_reward_func/std": 0.4628632962703705, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2061 }, { "completion_length": 781.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1969.0, "completions/max_terminated_length": 1969.0, "completions/mean_length": 781.0833740234375, "completions/mean_terminated_length": 781.0833740234375, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.6994572591587517, "frac_reward_zero_std": 1.0, "grad_norm": 1.117403627404201e-07, "kl": 0.0, "learning_rate": 1.530365769496204e-07, "loss": 0.0, "num_tokens": 63822793.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2062 }, { "completion_length": 497.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 497.0, "completions/mean_terminated_length": 497.0, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.6997964721845319, "frac_reward_zero_std": 1.0, "grad_norm": 9.805304301835349e-08, "kl": 0.0, "learning_rate": 1.52864044168392e-07, "loss": 0.0, "num_tokens": 63840307.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2063 }, { "completion_length": 1315.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2312.0, "completions/max_terminated_length": 2312.0, "completions/mean_length": 1315.0, "completions/mean_terminated_length": 1315.0, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.7001356852103121, "frac_reward_zero_std": 1.0, "grad_norm": 1.1868973359696611e-07, "kl": 0.0, "learning_rate": 1.5269151138716354e-07, "loss": 0.0, "num_tokens": 63869617.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2064 }, { "completion_length": 621.0, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 621.0, "completions/mean_terminated_length": 621.0, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.7004748982360922, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.5251897860593512e-07, "loss": 0.0, "num_tokens": 63885793.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2065 }, { "completion_length": 895.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1793.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 895.9166870117188, "completions/mean_terminated_length": 895.9166870117188, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.7008141112618724, "frac_reward_zero_std": 1.0, "grad_norm": 1.5061262104154594e-07, "kl": 0.0, "learning_rate": 1.5234644582470667e-07, "loss": 0.0, "num_tokens": 63909324.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2066 }, { "completion_length": 1216.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2918.0, "completions/max_terminated_length": 2918.0, "completions/mean_length": 1216.8333740234375, "completions/mean_terminated_length": 1216.8333740234375, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.7011533242876526, "frac_reward_zero_std": 0.5, "grad_norm": 0.04452994838356972, "kl": 0.0, "learning_rate": 1.5217391304347825e-07, "loss": -0.0001, "num_tokens": 63935848.0, "reward": 1.0875000953674316, "reward_std": 0.030618607997894287, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 2067 }, { "completion_length": 1094.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1921.0, "completions/max_terminated_length": 1921.0, "completions/mean_length": 1094.916748046875, "completions/mean_terminated_length": 1094.916748046875, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 0.7014925373134329, "frac_reward_zero_std": 0.5, "grad_norm": 0.10390769690275192, "kl": 0.0, "learning_rate": 1.520013802622498e-07, "loss": 0.0014, "num_tokens": 63955947.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2068 }, { "completion_length": 737.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1094.0, "completions/max_terminated_length": 1094.0, "completions/mean_length": 737.4166870117188, "completions/mean_terminated_length": 737.4166870117188, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.7018317503392131, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.518288474810214e-07, "loss": 0.0, "num_tokens": 63974090.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2069 }, { "completion_length": 824.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1468.0, "completions/max_terminated_length": 1468.0, "completions/mean_length": 824.8333740234375, "completions/mean_terminated_length": 824.8333740234375, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "epoch": 0.7021709633649932, "frac_reward_zero_std": 0.5, "grad_norm": 0.0775667279958725, "kl": 0.0, "learning_rate": 1.5165631469979296e-07, "loss": -0.0003, "num_tokens": 63992526.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2070 }, { "completion_length": 1949.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4630.0, "completions/max_terminated_length": 4630.0, "completions/mean_length": 1949.75, "completions/mean_terminated_length": 1949.75, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "epoch": 0.7025101763907734, "frac_reward_zero_std": 1.0, "grad_norm": 1.2506330904216156e-07, "kl": 0.0, "learning_rate": 1.5148378191856452e-07, "loss": 0.0, "num_tokens": 64031007.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2071 }, { "completion_length": 1403.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3885.0, "completions/max_terminated_length": 3885.0, "completions/mean_length": 1403.166748046875, "completions/mean_terminated_length": 1403.166748046875, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.7028493894165536, "frac_reward_zero_std": 0.5, "grad_norm": 0.67208331823349, "kl": 0.0, "learning_rate": 1.513112491373361e-07, "loss": 0.0197, "num_tokens": 64064933.0, "reward": 1.2166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2072 }, { "completion_length": 688.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 688.1666870117188, "completions/mean_terminated_length": 688.1666870117188, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 0.7031886024423338, "frac_reward_zero_std": 1.0, "grad_norm": 1.030124963108392e-07, "kl": 0.0, "learning_rate": 1.5113871635610765e-07, "loss": 0.0, "num_tokens": 64087051.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2073 }, { "completion_length": 901.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1857.0, "completions/max_terminated_length": 1857.0, "completions/mean_length": 901.4166870117188, "completions/mean_terminated_length": 901.4166870117188, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.7035278154681139, "frac_reward_zero_std": 0.5, "grad_norm": 0.06303401291370392, "kl": 0.0, "learning_rate": 1.5096618357487923e-07, "loss": 0.0002, "num_tokens": 64108434.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2074 }, { "completion_length": 635.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 635.0833740234375, "completions/mean_terminated_length": 635.0833740234375, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.7038670284938942, "frac_reward_zero_std": 0.5, "grad_norm": 0.06534280627965927, "kl": 0.0, "learning_rate": 1.5079365079365078e-07, "loss": -0.0007, "num_tokens": 64122871.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2075 }, { "completion_length": 940.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3071.0, "completions/max_terminated_length": 3071.0, "completions/mean_length": 940.8333740234375, "completions/mean_terminated_length": 940.8333740234375, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.7042062415196744, "frac_reward_zero_std": 0.5, "grad_norm": 0.47002753615379333, "kl": 0.0, "learning_rate": 1.5062111801242236e-07, "loss": 0.0154, "num_tokens": 64146209.0, "reward": 1.066666603088379, "reward_std": 0.20655910670757294, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941801071167, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2076 }, { "completion_length": 966.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2010.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 966.0, "completions/mean_terminated_length": 966.0, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 0.7045454545454546, "frac_reward_zero_std": 0.0, "grad_norm": 0.10689998418092728, "kl": 0.0, "learning_rate": 1.5044858523119391e-07, "loss": -0.0013, "num_tokens": 64168481.0, "reward": 1.2666666507720947, "reward_std": 0.08164961636066437, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2077 }, { "completion_length": 888.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1568.0, "completions/max_terminated_length": 1568.0, "completions/mean_length": 888.9166870117188, "completions/mean_terminated_length": 888.9166870117188, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 0.7048846675712347, "frac_reward_zero_std": 1.0, "grad_norm": 2.4666860554134473e-07, "kl": 0.0, "learning_rate": 1.502760524499655e-07, "loss": 0.0, "num_tokens": 64193308.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2078 }, { "completion_length": 938.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2494.0, "completions/max_terminated_length": 2494.0, "completions/mean_length": 938.0833740234375, "completions/mean_terminated_length": 938.0833740234375, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "epoch": 0.7052238805970149, "frac_reward_zero_std": 0.5, "grad_norm": 0.0583137571811676, "kl": 0.0, "learning_rate": 1.5010351966873705e-07, "loss": -0.0005, "num_tokens": 64220177.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2079 }, { "completion_length": 848.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1821.0, "completions/max_terminated_length": 1821.0, "completions/mean_length": 848.8333740234375, "completions/mean_terminated_length": 848.8333740234375, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 0.7055630936227951, "frac_reward_zero_std": 0.5, "grad_norm": 0.048346105962991714, "kl": 0.0, "learning_rate": 1.4993098688750862e-07, "loss": 0.0, "num_tokens": 64243149.0, "reward": 1.0875000953674316, "reward_std": 0.030618613585829735, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 2080 }, { "completion_length": 471.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 471.8333435058594, "completions/mean_terminated_length": 471.8333435058594, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.7059023066485753, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.4975845410628018e-07, "loss": 0.0, "num_tokens": 64261315.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2081 }, { "completion_length": 575.75, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 575.75, "completions/mean_terminated_length": 575.75, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.7062415196743554, "frac_reward_zero_std": 0.5, "grad_norm": 0.0726902112364769, "kl": 0.0, "learning_rate": 1.4958592132505173e-07, "loss": 0.0006, "num_tokens": 64280992.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.09045340120792389, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2082 }, { "completion_length": 1867.0833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4619.0, "completions/mean_length": 2416.166748046875, "completions/mean_terminated_length": 2036.8182373046875, "completions/min_length": 669.0, "completions/min_terminated_length": 669.0, "epoch": 0.7065807327001357, "frac_reward_zero_std": 0.5, "grad_norm": 0.05085520073771477, "kl": NaN, "learning_rate": 1.494133885438233e-07, "loss": -0.0051, "num_tokens": 64316525.0, "reward": 0.6875001788139343, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 2083 }, { "completion_length": 1204.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1904.0, "completions/max_terminated_length": 1904.0, "completions/mean_length": 1204.3333740234375, "completions/mean_terminated_length": 1204.3333740234375, "completions/min_length": 769.0, "completions/min_terminated_length": 769.0, "epoch": 0.7069199457259159, "frac_reward_zero_std": 0.5, "grad_norm": 0.10703430324792862, "kl": 0.0, "learning_rate": 1.4924085576259486e-07, "loss": -0.0006, "num_tokens": 64342089.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2084 }, { "completion_length": 1352.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2885.0, "completions/max_terminated_length": 2885.0, "completions/mean_length": 1352.5, "completions/mean_terminated_length": 1352.5, "completions/min_length": 680.0, "completions/min_terminated_length": 680.0, "epoch": 0.7072591587516961, "frac_reward_zero_std": 0.5, "grad_norm": 0.0826234444975853, "kl": 0.0, "learning_rate": 1.4906832298136647e-07, "loss": -0.0014, "num_tokens": 64372617.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2085 }, { "completion_length": 2275.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3878.0, "completions/max_terminated_length": 3878.0, "completions/mean_length": 2275.666748046875, "completions/mean_terminated_length": 2275.666748046875, "completions/min_length": 1395.0, "completions/min_terminated_length": 1395.0, "epoch": 0.7075983717774763, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.4889579020013802e-07, "loss": 0.0, "num_tokens": 64409555.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2086 }, { "completion_length": 530.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 938.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 530.4166870117188, "completions/mean_terminated_length": 530.4166870117188, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.7079375848032564, "frac_reward_zero_std": 1.0, "grad_norm": 7.99445487587036e-08, "kl": 0.0, "learning_rate": 1.487232574189096e-07, "loss": 0.0, "num_tokens": 64432078.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2087 }, { "completion_length": 793.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1750.0, "completions/max_terminated_length": 1750.0, "completions/mean_length": 793.5, "completions/mean_terminated_length": 793.5, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.7082767978290366, "frac_reward_zero_std": 0.5, "grad_norm": 0.09639241546392441, "kl": 0.0, "learning_rate": 1.4855072463768115e-07, "loss": -0.0011, "num_tokens": 64455256.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2088 }, { "completion_length": 2623.25, "completions/clipped_ratio": 0.0, "completions/max_length": 6086.0, "completions/max_terminated_length": 6086.0, "completions/mean_length": 2623.25, "completions/mean_terminated_length": 2623.25, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.7086160108548168, "frac_reward_zero_std": 0.5, "grad_norm": 0.7291979789733887, "kl": 0.0, "learning_rate": 1.4837819185645273e-07, "loss": 0.017, "num_tokens": 64498381.0, "reward": 1.2166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2089 }, { "completion_length": 817.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2567.0, "completions/max_terminated_length": 2567.0, "completions/mean_length": 817.1666870117188, "completions/mean_terminated_length": 817.1666870117188, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.7089552238805971, "frac_reward_zero_std": 0.0, "grad_norm": 0.4580311179161072, "kl": 0.0, "learning_rate": 1.4820565907522429e-07, "loss": 0.0051, "num_tokens": 64517193.0, "reward": 0.9333333373069763, "reward_std": 0.30983859300613403, "rewards/correctness_reward_func/mean": 0.6333333253860474, "rewards/correctness_reward_func/std": 0.4735424220561981, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2090 }, { "completion_length": 492.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 492.0833435058594, "completions/mean_terminated_length": 492.0833435058594, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.7092944369063772, "frac_reward_zero_std": 0.0, "grad_norm": 0.08846572041511536, "kl": 0.0, "learning_rate": 1.4803312629399586e-07, "loss": -0.0003, "num_tokens": 64534912.0, "reward": 1.183333396911621, "reward_std": 0.09246458858251572, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2091 }, { "completion_length": 800.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1287.0, "completions/max_terminated_length": 1287.0, "completions/mean_length": 800.75, "completions/mean_terminated_length": 800.75, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.7096336499321574, "frac_reward_zero_std": 1.0, "grad_norm": 1.6590935558724595e-07, "kl": 0.0, "learning_rate": 1.4786059351276742e-07, "loss": 0.0, "num_tokens": 64553131.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2092 }, { "completion_length": 714.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 905.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 714.4166870117188, "completions/mean_terminated_length": 714.4166870117188, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 0.7099728629579376, "frac_reward_zero_std": 1.0, "grad_norm": 1.6582923478836165e-07, "kl": 0.0, "learning_rate": 1.47688060731539e-07, "loss": 0.0, "num_tokens": 64572276.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2093 }, { "completion_length": 1446.916748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3735.0, "completions/mean_length": 1996.0, "completions/mean_terminated_length": 1578.45458984375, "completions/min_length": 536.0, "completions/min_terminated_length": 536.0, "epoch": 0.7103120759837178, "frac_reward_zero_std": 0.0, "grad_norm": 0.10103930532932281, "kl": NaN, "learning_rate": 1.4751552795031055e-07, "loss": -0.0046, "num_tokens": 64604645.0, "reward": 0.7541667819023132, "reward_std": 0.08225837349891663, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 2094 }, { "completion_length": 1677.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3457.0, "completions/max_terminated_length": 3457.0, "completions/mean_length": 1677.25, "completions/mean_terminated_length": 1677.25, "completions/min_length": 671.0, "completions/min_terminated_length": 671.0, "epoch": 0.7106512890094979, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.473429951690821e-07, "loss": 0.0, "num_tokens": 64631354.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2095 }, { "completion_length": 828.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1346.0, "completions/max_terminated_length": 1346.0, "completions/mean_length": 828.6666870117188, "completions/mean_terminated_length": 828.6666870117188, "completions/min_length": 491.0, "completions/min_terminated_length": 491.0, "epoch": 0.7109905020352781, "frac_reward_zero_std": 0.5, "grad_norm": 0.08667231351137161, "kl": 0.0, "learning_rate": 1.4717046238785368e-07, "loss": 0.0004, "num_tokens": 64656604.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2096 }, { "completion_length": 2200.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4854.0, "completions/max_terminated_length": 4854.0, "completions/mean_length": 2200.166748046875, "completions/mean_terminated_length": 2200.166748046875, "completions/min_length": 626.0, "completions/min_terminated_length": 626.0, "epoch": 0.7113297150610584, "frac_reward_zero_std": 0.5, "grad_norm": 0.48238351941108704, "kl": 0.0, "learning_rate": 1.4699792960662523e-07, "loss": 0.0037, "num_tokens": 64696962.0, "reward": 1.0333335399627686, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2097 }, { "completion_length": 1246.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3955.0, "completions/max_terminated_length": 3955.0, "completions/mean_length": 1246.416748046875, "completions/mean_terminated_length": 1246.416748046875, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 0.7116689280868386, "frac_reward_zero_std": 0.5, "grad_norm": 0.4783247411251068, "kl": 0.0, "learning_rate": 1.4682539682539681e-07, "loss": 0.0214, "num_tokens": 64725377.0, "reward": 1.0333335399627686, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2098 }, { "completion_length": 961.1667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 1853.0, "completions/max_terminated_length": 1853.0, "completions/mean_length": 961.1666870117188, "completions/mean_terminated_length": 961.1666870117188, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.7120081411126187, "frac_reward_zero_std": 0.5, "grad_norm": 0.4418611526489258, "kl": 0.0, "learning_rate": 1.4665286404416837e-07, "loss": 0.001, "num_tokens": 64749139.0, "reward": 1.2166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2099 }, { "completion_length": 890.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1719.0, "completions/max_terminated_length": 1719.0, "completions/mean_length": 890.5833740234375, "completions/mean_terminated_length": 890.5833740234375, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 0.7123473541383989, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.4648033126293997e-07, "loss": 0.0, "num_tokens": 64772756.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2100 }, { "completion_length": 1211.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2303.0, "completions/max_terminated_length": 2303.0, "completions/mean_length": 1211.3333740234375, "completions/mean_terminated_length": 1211.3333740234375, "completions/min_length": 692.0, "completions/min_terminated_length": 692.0, "epoch": 0.7126865671641791, "frac_reward_zero_std": 1.0, "grad_norm": 1.5068137315665808e-07, "kl": 0.0, "learning_rate": 1.4630779848171153e-07, "loss": 0.0, "num_tokens": 64798986.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2101 }, { "completion_length": 762.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1075.0, "completions/max_terminated_length": 1075.0, "completions/mean_length": 762.6666870117188, "completions/mean_terminated_length": 762.6666870117188, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 0.7130257801899593, "frac_reward_zero_std": 1.0, "grad_norm": 2.2349686901179666e-07, "kl": 0.0, "learning_rate": 1.461352657004831e-07, "loss": 0.0, "num_tokens": 64817258.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2102 }, { "completion_length": 760.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1602.0, "completions/max_terminated_length": 1602.0, "completions/mean_length": 760.0833740234375, "completions/mean_terminated_length": 760.0833740234375, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.7133649932157394, "frac_reward_zero_std": 0.5, "grad_norm": 0.06805427372455597, "kl": 0.0, "learning_rate": 1.4596273291925466e-07, "loss": -0.0001, "num_tokens": 64838637.0, "reward": 1.1500000953674316, "reward_std": 0.05477222427725792, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2103 }, { "completion_length": 1927.8334350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 4699.0, "completions/max_terminated_length": 4699.0, "completions/mean_length": 1927.8333740234375, "completions/mean_terminated_length": 1927.8333740234375, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.7137042062415196, "frac_reward_zero_std": 1.0, "grad_norm": 1.6258435664440185e-07, "kl": 0.0, "learning_rate": 1.4579020013802624e-07, "loss": 0.0, "num_tokens": 64871065.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2104 }, { "completion_length": 1432.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3949.0, "completions/max_terminated_length": 3949.0, "completions/mean_length": 1432.0, "completions/mean_terminated_length": 1432.0, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.7140434192672999, "frac_reward_zero_std": 0.5, "grad_norm": 0.574007511138916, "kl": 0.0, "learning_rate": 1.456176673567978e-07, "loss": 0.0208, "num_tokens": 64904989.0, "reward": 1.1666667461395264, "reward_std": 0.18618986010551453, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.287096232175827, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2105 }, { "completion_length": 955.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1557.0, "completions/max_terminated_length": 1557.0, "completions/mean_length": 955.25, "completions/mean_terminated_length": 955.25, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.7143826322930801, "frac_reward_zero_std": 0.5, "grad_norm": 0.565056324005127, "kl": 0.0, "learning_rate": 1.4544513457556934e-07, "loss": -0.0101, "num_tokens": 64926064.0, "reward": 0.6333333253860474, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2106 }, { "completion_length": 3050.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6313.0, "completions/max_terminated_length": 6313.0, "completions/mean_length": 3050.666748046875, "completions/mean_terminated_length": 3050.666748046875, "completions/min_length": 715.0, "completions/min_terminated_length": 715.0, "epoch": 0.7147218453188603, "frac_reward_zero_std": 0.0, "grad_norm": 0.8644875884056091, "kl": 0.0, "learning_rate": 1.4527260179434092e-07, "loss": -0.0431, "num_tokens": 64974060.0, "reward": 1.0333333015441895, "reward_std": 0.3146860599517822, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.44585633277893066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2107 }, { "completion_length": 2497.0834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 5293.0, "completions/max_terminated_length": 5293.0, "completions/mean_length": 2497.08349609375, "completions/mean_terminated_length": 2497.08349609375, "completions/min_length": 955.0, "completions/min_terminated_length": 955.0, "epoch": 0.7150610583446404, "frac_reward_zero_std": 0.5, "grad_norm": 0.5047267079353333, "kl": 0.0, "learning_rate": 1.4510006901311247e-07, "loss": -0.0083, "num_tokens": 65016013.0, "reward": 0.4333333671092987, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.13333334028720856, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2108 }, { "completion_length": 2198.0834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 5265.0, "completions/max_terminated_length": 5265.0, "completions/mean_length": 2198.08349609375, "completions/mean_terminated_length": 2198.08349609375, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "epoch": 0.7154002713704206, "frac_reward_zero_std": 0.5, "grad_norm": 0.10537995398044586, "kl": 0.0, "learning_rate": 1.4492753623188405e-07, "loss": 0.0027, "num_tokens": 65055182.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2109 }, { "completion_length": 481.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 481.3333435058594, "completions/mean_terminated_length": 481.3333435058594, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.7157394843962008, "frac_reward_zero_std": 1.0, "grad_norm": 8.387034711176966e-08, "kl": 0.0, "learning_rate": 1.447550034506556e-07, "loss": 0.0, "num_tokens": 65076138.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2110 }, { "completion_length": 1339.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3107.0, "completions/max_terminated_length": 3107.0, "completions/mean_length": 1339.166748046875, "completions/mean_terminated_length": 1339.166748046875, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.716078697421981, "frac_reward_zero_std": 1.0, "grad_norm": 1.454854867688482e-07, "kl": 0.0, "learning_rate": 1.4458247066942719e-07, "loss": 0.0, "num_tokens": 65102930.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2111 }, { "completion_length": 1067.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1509.0, "completions/max_terminated_length": 1509.0, "completions/mean_length": 1067.5833740234375, "completions/mean_terminated_length": 1067.5833740234375, "completions/min_length": 659.0, "completions/min_terminated_length": 659.0, "epoch": 0.7164179104477612, "frac_reward_zero_std": 1.0, "grad_norm": 1.1840645441907327e-07, "kl": 0.0, "learning_rate": 1.4440993788819874e-07, "loss": 0.0, "num_tokens": 65128539.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2112 }, { "completion_length": 914.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1856.0, "completions/max_terminated_length": 1856.0, "completions/mean_length": 914.5, "completions/mean_terminated_length": 914.5, "completions/min_length": 526.0, "completions/min_terminated_length": 526.0, "epoch": 0.7167571234735414, "frac_reward_zero_std": 1.0, "grad_norm": 1.5714941525857284e-07, "kl": 0.0, "learning_rate": 1.4423740510697032e-07, "loss": 0.0, "num_tokens": 65142987.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2113 }, { "completion_length": 671.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 880.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 671.4166870117188, "completions/mean_terminated_length": 671.4166870117188, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "epoch": 0.7170963364993216, "frac_reward_zero_std": 0.0, "grad_norm": 0.3106325566768646, "kl": 0.0, "learning_rate": 1.4406487232574187e-07, "loss": 0.0015, "num_tokens": 65162372.0, "reward": 1.1000001430511475, "reward_std": 0.23084041476249695, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2114 }, { "completion_length": 656.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 854.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 656.1666870117188, "completions/mean_terminated_length": 656.1666870117188, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.7174355495251018, "frac_reward_zero_std": 1.0, "grad_norm": 1.1153619539072679e-07, "kl": 0.0, "learning_rate": 1.4389233954451345e-07, "loss": 0.0, "num_tokens": 65184436.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2115 }, { "completion_length": 479.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 792.0, "completions/max_terminated_length": 792.0, "completions/mean_length": 479.3333435058594, "completions/mean_terminated_length": 479.3333435058594, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.7177747625508819, "frac_reward_zero_std": 0.5, "grad_norm": 0.10834439843893051, "kl": 0.0, "learning_rate": 1.4371980676328503e-07, "loss": -0.0002, "num_tokens": 65202434.0, "reward": 0.7333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2116 }, { "completion_length": 1488.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5029.0, "completions/max_terminated_length": 5029.0, "completions/mean_length": 1488.666748046875, "completions/mean_terminated_length": 1488.666748046875, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.7181139755766621, "frac_reward_zero_std": 0.5, "grad_norm": 0.36833080649375916, "kl": 0.0, "learning_rate": 1.4354727398205658e-07, "loss": 0.0092, "num_tokens": 65229724.0, "reward": 0.7666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2117 }, { "completion_length": 1875.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3359.0, "completions/max_terminated_length": 3359.0, "completions/mean_length": 1875.25, "completions/mean_terminated_length": 1875.25, "completions/min_length": 608.0, "completions/min_terminated_length": 608.0, "epoch": 0.7184531886024423, "frac_reward_zero_std": 1.0, "grad_norm": 2.6469737690604234e-07, "kl": 0.0, "learning_rate": 1.4337474120082816e-07, "loss": 0.0, "num_tokens": 65262229.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2118 }, { "completion_length": 752.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1210.0, "completions/max_terminated_length": 1210.0, "completions/mean_length": 752.0, "completions/mean_terminated_length": 752.0, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 0.7187924016282226, "frac_reward_zero_std": 1.0, "grad_norm": 1.8070409169013146e-07, "kl": 0.0, "learning_rate": 1.4320220841959971e-07, "loss": 0.0, "num_tokens": 65285923.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2119 }, { "completion_length": 702.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1299.0, "completions/max_terminated_length": 1299.0, "completions/mean_length": 702.8333740234375, "completions/mean_terminated_length": 702.8333740234375, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.7191316146540027, "frac_reward_zero_std": 0.0, "grad_norm": 0.10900707542896271, "kl": 0.0, "learning_rate": 1.430296756383713e-07, "loss": -0.0007, "num_tokens": 65306033.0, "reward": 1.2333333492279053, "reward_std": 0.10327950119972229, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2120 }, { "completion_length": 1083.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2030.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1083.3333740234375, "completions/mean_terminated_length": 1083.3333740234375, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 0.7194708276797829, "frac_reward_zero_std": 1.0, "grad_norm": 2.1387455717558623e-07, "kl": 0.0, "learning_rate": 1.4285714285714285e-07, "loss": 0.0, "num_tokens": 65330121.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2121 }, { "completion_length": 520.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 520.0833740234375, "completions/mean_terminated_length": 520.0833740234375, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.7198100407055631, "frac_reward_zero_std": 0.5, "grad_norm": 0.05039533972740173, "kl": 0.0, "learning_rate": 1.4268461007591443e-07, "loss": 0.0002, "num_tokens": 65348260.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2122 }, { "completion_length": 2684.7501220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6427.0, "completions/mean_length": 3233.83349609375, "completions/mean_terminated_length": 2928.818359375, "completions/min_length": 659.0, "completions/min_terminated_length": 659.0, "epoch": 0.7201492537313433, "frac_reward_zero_std": 0.5, "grad_norm": 0.8266303539276123, "kl": NaN, "learning_rate": 1.4251207729468598e-07, "loss": -0.0539, "num_tokens": 65394379.0, "reward": 1.1083333492279053, "reward_std": 0.3006936311721802, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 2123 }, { "completion_length": 822.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1025.0, "completions/max_terminated_length": 1025.0, "completions/mean_length": 822.5833740234375, "completions/mean_terminated_length": 822.5833740234375, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 0.7204884667571235, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.4233954451345756e-07, "loss": 0.0, "num_tokens": 65418302.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2124 }, { "completion_length": 987.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2105.0, "completions/max_terminated_length": 2105.0, "completions/mean_length": 987.25, "completions/mean_terminated_length": 987.25, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 0.7208276797829036, "frac_reward_zero_std": 0.5, "grad_norm": 0.0682312548160553, "kl": 0.0, "learning_rate": 1.421670117322291e-07, "loss": -0.0017, "num_tokens": 65446661.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2125 }, { "completion_length": 516.25, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 516.25, "completions/mean_terminated_length": 516.25, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.7211668928086838, "frac_reward_zero_std": 0.0, "grad_norm": 0.0821535512804985, "kl": 0.0, "learning_rate": 1.419944789510007e-07, "loss": -0.0, "num_tokens": 65465042.0, "reward": 1.2000000476837158, "reward_std": 0.08164963126182556, "rewards/correctness_reward_func/mean": 0.9000000357627869, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2126 }, { "completion_length": 2148.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5866.0, "completions/max_terminated_length": 5866.0, "completions/mean_length": 2148.916748046875, "completions/mean_terminated_length": 2148.916748046875, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 0.7215061058344641, "frac_reward_zero_std": 0.5, "grad_norm": 0.5371212363243103, "kl": 0.0, "learning_rate": 1.4182194616977224e-07, "loss": -0.0008, "num_tokens": 65504683.0, "reward": 0.8833333849906921, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.5149286389350891, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2127 }, { "completion_length": 1846.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3619.0, "completions/max_terminated_length": 3619.0, "completions/mean_length": 1846.5, "completions/mean_terminated_length": 1846.5, "completions/min_length": 679.0, "completions/min_terminated_length": 679.0, "epoch": 0.7218453188602443, "frac_reward_zero_std": 0.0, "grad_norm": 0.8169918060302734, "kl": 0.0, "learning_rate": 1.416494133885438e-07, "loss": 0.0296, "num_tokens": 65538589.0, "reward": 0.8708333969116211, "reward_std": 0.44292864203453064, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 2128 }, { "completion_length": 1489.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2862.0, "completions/max_terminated_length": 2862.0, "completions/mean_length": 1489.916748046875, "completions/mean_terminated_length": 1489.916748046875, "completions/min_length": 554.0, "completions/min_terminated_length": 554.0, "epoch": 0.7221845318860244, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.4147688060731538e-07, "loss": 0.0, "num_tokens": 65566278.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2129 }, { "completion_length": 1026.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1668.0, "completions/max_terminated_length": 1668.0, "completions/mean_length": 1026.8333740234375, "completions/mean_terminated_length": 1026.8333740234375, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "epoch": 0.7225237449118046, "frac_reward_zero_std": 0.0, "grad_norm": 0.11737686395645142, "kl": 0.0, "learning_rate": 1.4130434782608693e-07, "loss": -0.001, "num_tokens": 65591134.0, "reward": 1.183333396911621, "reward_std": 0.09246458858251572, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2130 }, { "completion_length": 707.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1148.0, "completions/max_terminated_length": 1148.0, "completions/mean_length": 707.3333740234375, "completions/mean_terminated_length": 707.3333740234375, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 0.7228629579375848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.4113181504485853e-07, "loss": 0.0, "num_tokens": 65605316.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2131 }, { "completion_length": 1827.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3409.0, "completions/max_terminated_length": 3409.0, "completions/mean_length": 1827.0, "completions/mean_terminated_length": 1827.0, "completions/min_length": 1041.0, "completions/min_terminated_length": 1041.0, "epoch": 0.723202170963365, "frac_reward_zero_std": 0.5, "grad_norm": 0.4431602656841278, "kl": 0.0, "learning_rate": 1.409592822636301e-07, "loss": 0.0161, "num_tokens": 65642948.0, "reward": 0.6333333253860474, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2132 }, { "completion_length": 860.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1335.0, "completions/max_terminated_length": 1335.0, "completions/mean_length": 860.5833740234375, "completions/mean_terminated_length": 860.5833740234375, "completions/min_length": 538.0, "completions/min_terminated_length": 538.0, "epoch": 0.7235413839891451, "frac_reward_zero_std": 0.5, "grad_norm": 0.08222872763872147, "kl": 0.0, "learning_rate": 1.4078674948240167e-07, "loss": -0.0007, "num_tokens": 65664837.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2133 }, { "completion_length": 1494.3333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5601.0, "completions/mean_length": 2043.416748046875, "completions/mean_terminated_length": 1630.181884765625, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.7238805970149254, "frac_reward_zero_std": 0.0, "grad_norm": 0.3825955092906952, "kl": NaN, "learning_rate": 1.4061421670117322e-07, "loss": -0.0311, "num_tokens": 65693593.0, "reward": 1.1041667461395264, "reward_std": 0.23474274575710297, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 2134 }, { "completion_length": 1522.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3444.0, "completions/max_terminated_length": 3444.0, "completions/mean_length": 1522.916748046875, "completions/mean_terminated_length": 1522.916748046875, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "epoch": 0.7242198100407056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.404416839199448e-07, "loss": 0.0, "num_tokens": 65719782.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2135 }, { "completion_length": 1398.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3360.0, "completions/max_terminated_length": 3360.0, "completions/mean_length": 1398.916748046875, "completions/mean_terminated_length": 1398.916748046875, "completions/min_length": 665.0, "completions/min_terminated_length": 665.0, "epoch": 0.7245590230664858, "frac_reward_zero_std": 1.0, "grad_norm": 2.4048893010331085e-07, "kl": 0.0, "learning_rate": 1.4026915113871635e-07, "loss": 0.0, "num_tokens": 65748377.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2136 }, { "completion_length": 1324.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3169.0, "completions/max_terminated_length": 3169.0, "completions/mean_length": 1324.5, "completions/mean_terminated_length": 1324.5, "completions/min_length": 585.0, "completions/min_terminated_length": 585.0, "epoch": 0.7248982360922659, "frac_reward_zero_std": 0.0, "grad_norm": 0.32665780186653137, "kl": 0.0, "learning_rate": 1.4009661835748793e-07, "loss": 0.0021, "num_tokens": 65779049.0, "reward": 1.2000000476837158, "reward_std": 0.24494895339012146, "rewards/correctness_reward_func/mean": 0.9000000357627869, "rewards/correctness_reward_func/std": 0.28919950127601624, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2137 }, { "completion_length": 549.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 899.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 549.6666870117188, "completions/mean_terminated_length": 549.6666870117188, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.7252374491180461, "frac_reward_zero_std": 0.5, "grad_norm": 0.06681986153125763, "kl": 0.0, "learning_rate": 1.3992408557625948e-07, "loss": 0.0001, "num_tokens": 65793415.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2138 }, { "completion_length": 797.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1429.0, "completions/max_terminated_length": 1429.0, "completions/mean_length": 797.5833740234375, "completions/mean_terminated_length": 797.5833740234375, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 0.7255766621438263, "frac_reward_zero_std": 0.5, "grad_norm": 0.05469308793544769, "kl": 0.0, "learning_rate": 1.3975155279503104e-07, "loss": -0.0004, "num_tokens": 65815352.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2139 }, { "completion_length": 1866.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4029.0, "completions/max_terminated_length": 4029.0, "completions/mean_length": 1866.166748046875, "completions/mean_terminated_length": 1866.166748046875, "completions/min_length": 758.0, "completions/min_terminated_length": 758.0, "epoch": 0.7259158751696065, "frac_reward_zero_std": 0.0, "grad_norm": 0.8573922514915466, "kl": 0.0, "learning_rate": 1.3957902001380262e-07, "loss": 0.0168, "num_tokens": 65848450.0, "reward": 0.9000000357627869, "reward_std": 0.41908901929855347, "rewards/correctness_reward_func/mean": 0.5999999642372131, "rewards/correctness_reward_func/std": 0.45126086473464966, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2140 }, { "completion_length": 601.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 601.5833740234375, "completions/mean_terminated_length": 601.5833740234375, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.7262550881953868, "frac_reward_zero_std": 0.0, "grad_norm": 0.09630685299634933, "kl": 0.0, "learning_rate": 1.3940648723257417e-07, "loss": -0.0008, "num_tokens": 65866247.0, "reward": 1.2000000476837158, "reward_std": 0.10327951610088348, "rewards/correctness_reward_func/mean": 0.9000000357627869, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2141 }, { "completion_length": 1683.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3510.0, "completions/max_terminated_length": 3510.0, "completions/mean_length": 1683.916748046875, "completions/mean_terminated_length": 1683.916748046875, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.7265943012211669, "frac_reward_zero_std": 0.5, "grad_norm": 0.39006340503692627, "kl": 0.0, "learning_rate": 1.3923395445134575e-07, "loss": 0.0087, "num_tokens": 65895142.0, "reward": 0.46666666865348816, "reward_std": 0.2581988573074341, "rewards/correctness_reward_func/mean": 0.1666666716337204, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2142 }, { "completion_length": 1236.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2111.0, "completions/max_terminated_length": 2111.0, "completions/mean_length": 1236.25, "completions/mean_terminated_length": 1236.25, "completions/min_length": 764.0, "completions/min_terminated_length": 764.0, "epoch": 0.7269335142469471, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.390614216701173e-07, "loss": 0.0, "num_tokens": 65925985.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2143 }, { "completion_length": 795.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1286.0, "completions/max_terminated_length": 1286.0, "completions/mean_length": 795.4166870117188, "completions/mean_terminated_length": 795.4166870117188, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.7272727272727273, "frac_reward_zero_std": 0.5, "grad_norm": 0.07595156878232956, "kl": 0.0, "learning_rate": 1.3888888888888888e-07, "loss": -0.0009, "num_tokens": 65949780.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2144 }, { "completion_length": 2901.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5230.0, "completions/max_terminated_length": 5230.0, "completions/mean_length": 2901.5, "completions/mean_terminated_length": 2901.5, "completions/min_length": 1029.0, "completions/min_terminated_length": 1029.0, "epoch": 0.7276119402985075, "frac_reward_zero_std": 0.5, "grad_norm": 0.07734272629022598, "kl": 0.0, "learning_rate": 1.3871635610766043e-07, "loss": 0.0004, "num_tokens": 65996976.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2145 }, { "completion_length": 625.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1242.0, "completions/max_terminated_length": 1242.0, "completions/mean_length": 625.25, "completions/mean_terminated_length": 625.25, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.7279511533242876, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.38543823326432e-07, "loss": 0.0, "num_tokens": 66015771.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2146 }, { "completion_length": 1090.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2602.0, "completions/max_terminated_length": 2602.0, "completions/mean_length": 1090.5833740234375, "completions/mean_terminated_length": 1090.5833740234375, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.7282903663500678, "frac_reward_zero_std": 0.5, "grad_norm": 0.06336503475904465, "kl": 0.0, "learning_rate": 1.383712905452036e-07, "loss": -0.0011, "num_tokens": 66043252.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2147 }, { "completion_length": 716.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1278.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 716.6666870117188, "completions/mean_terminated_length": 716.6666870117188, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.728629579375848, "frac_reward_zero_std": 1.0, "grad_norm": 1.5365445449333492e-07, "kl": 0.0, "learning_rate": 1.3819875776397517e-07, "loss": 0.0, "num_tokens": 66061800.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2148 }, { "completion_length": 1084.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2440.0, "completions/max_terminated_length": 2440.0, "completions/mean_length": 1084.0833740234375, "completions/mean_terminated_length": 1084.0833740234375, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.7289687924016283, "frac_reward_zero_std": 0.5, "grad_norm": 0.4173569679260254, "kl": 0.0, "learning_rate": 1.3802622498274672e-07, "loss": -0.0002, "num_tokens": 66084103.0, "reward": 1.183333396911621, "reward_std": 0.19407902657985687, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2149 }, { "completion_length": 878.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1402.0, "completions/max_terminated_length": 1402.0, "completions/mean_length": 878.5, "completions/mean_terminated_length": 878.5, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 0.7293080054274084, "frac_reward_zero_std": 0.5, "grad_norm": 0.0738079845905304, "kl": 0.0, "learning_rate": 1.378536922015183e-07, "loss": 0.0007, "num_tokens": 66108583.0, "reward": 1.2333333492279053, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2150 }, { "completion_length": 2521.8333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5980.0, "completions/mean_length": 3070.916748046875, "completions/mean_terminated_length": 2751.091064453125, "completions/min_length": 713.0, "completions/min_terminated_length": 713.0, "epoch": 0.7296472184531886, "frac_reward_zero_std": 0.0, "grad_norm": 0.45419785380363464, "kl": NaN, "learning_rate": 1.3768115942028986e-07, "loss": -0.0023, "num_tokens": 66152177.0, "reward": 0.770833432674408, "reward_std": 0.21288345754146576, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 2151 }, { "completion_length": 640.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 976.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 640.5833740234375, "completions/mean_terminated_length": 640.5833740234375, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.7299864314789688, "frac_reward_zero_std": 0.5, "grad_norm": 0.2223428636789322, "kl": 0.0, "learning_rate": 1.375086266390614e-07, "loss": -0.0022, "num_tokens": 66169482.0, "reward": 0.6333333253860474, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2152 }, { "completion_length": 530.8333587646484, "completions/clipped_ratio": 0.0, "completions/max_length": 1224.0, "completions/max_terminated_length": 1224.0, "completions/mean_length": 530.8333740234375, "completions/mean_terminated_length": 530.8333740234375, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.730325644504749, "frac_reward_zero_std": 0.5, "grad_norm": 0.059700001031160355, "kl": 0.0, "learning_rate": 1.37336093857833e-07, "loss": -0.0004, "num_tokens": 66189676.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2153 }, { "completion_length": 2434.2501220703125, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6065.0, "completions/mean_length": 3532.416748046875, "completions/mean_terminated_length": 2921.10009765625, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "epoch": 0.7306648575305291, "frac_reward_zero_std": 0.5, "grad_norm": 0.22037464380264282, "kl": NaN, "learning_rate": 1.3716356107660454e-07, "loss": -0.0296, "num_tokens": 66230791.0, "reward": 0.6500000953674316, "reward_std": 0.0774596780538559, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 2154 }, { "completion_length": 1561.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2441.0, "completions/max_terminated_length": 2441.0, "completions/mean_length": 1561.5, "completions/mean_terminated_length": 1561.5, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.7310040705563093, "frac_reward_zero_std": 0.0, "grad_norm": 0.7886664271354675, "kl": 0.0, "learning_rate": 1.3699102829537612e-07, "loss": -0.0005, "num_tokens": 66260365.0, "reward": 0.9000000357627869, "reward_std": 0.36985844373703003, "rewards/correctness_reward_func/mean": 0.6000000238418579, "rewards/correctness_reward_func/std": 0.36181360483169556, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2155 }, { "completion_length": 565.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 565.1666870117188, "completions/mean_terminated_length": 565.1666870117188, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 0.7313432835820896, "frac_reward_zero_std": 0.5, "grad_norm": 0.0518721304833889, "kl": 0.0, "learning_rate": 1.3681849551414767e-07, "loss": -0.0003, "num_tokens": 66279195.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2156 }, { "completion_length": 1691.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3624.0, "completions/max_terminated_length": 3624.0, "completions/mean_length": 1691.75, "completions/mean_terminated_length": 1691.75, "completions/min_length": 692.0, "completions/min_terminated_length": 692.0, "epoch": 0.7316824966078698, "frac_reward_zero_std": 0.5, "grad_norm": 0.536673903465271, "kl": 0.0, "learning_rate": 1.3664596273291925e-07, "loss": -0.0027, "num_tokens": 66308832.0, "reward": 1.2166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2157 }, { "completion_length": 1454.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5192.0, "completions/max_terminated_length": 5192.0, "completions/mean_length": 1454.25, "completions/mean_terminated_length": 1454.25, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 0.73202170963365, "frac_reward_zero_std": 0.0, "grad_norm": 0.8344326019287109, "kl": 0.0, "learning_rate": 1.364734299516908e-07, "loss": 0.0366, "num_tokens": 66336669.0, "reward": 0.7833333015441895, "reward_std": 0.4041241407394409, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2158 }, { "completion_length": 607.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 607.4166870117188, "completions/mean_terminated_length": 607.4166870117188, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 0.7323609226594301, "frac_reward_zero_std": 0.5, "grad_norm": 0.21252770721912384, "kl": 0.0, "learning_rate": 1.3630089717046238e-07, "loss": -0.0013, "num_tokens": 66355118.0, "reward": 0.6333333253860474, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2159 }, { "completion_length": 1587.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4439.0, "completions/max_terminated_length": 4439.0, "completions/mean_length": 1587.166748046875, "completions/mean_terminated_length": 1587.166748046875, "completions/min_length": 796.0, "completions/min_terminated_length": 796.0, "epoch": 0.7327001356852103, "frac_reward_zero_std": 1.0, "grad_norm": 1.4759773137029697e-07, "kl": 0.0, "learning_rate": 1.3612836438923394e-07, "loss": 0.0, "num_tokens": 66387688.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2160 }, { "completion_length": 1063.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2278.0, "completions/max_terminated_length": 2278.0, "completions/mean_length": 1063.75, "completions/mean_terminated_length": 1063.75, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 0.7330393487109905, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.3595583160800552e-07, "loss": 0.0, "num_tokens": 66412975.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2161 }, { "completion_length": 477.91668701171875, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 477.91668701171875, "completions/mean_terminated_length": 477.91668701171875, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.7333785617367707, "frac_reward_zero_std": 0.5, "grad_norm": 0.05666116252541542, "kl": 0.0, "learning_rate": 1.3578329882677707e-07, "loss": 0.0, "num_tokens": 66430320.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2162 }, { "completion_length": 881.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2175.0, "completions/max_terminated_length": 2175.0, "completions/mean_length": 881.5833740234375, "completions/mean_terminated_length": 881.5833740234375, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.7337177747625508, "frac_reward_zero_std": 1.0, "grad_norm": 1.1340380723368071e-07, "kl": 0.0, "learning_rate": 1.3561076604554865e-07, "loss": 0.0, "num_tokens": 66453109.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2163 }, { "completion_length": 712.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 712.0833740234375, "completions/mean_terminated_length": 712.0833740234375, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 0.7340569877883311, "frac_reward_zero_std": 0.5, "grad_norm": 0.07631095498800278, "kl": 0.0, "learning_rate": 1.3543823326432023e-07, "loss": -0.0002, "num_tokens": 66473012.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2164 }, { "completion_length": 834.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1659.0, "completions/max_terminated_length": 1659.0, "completions/mean_length": 834.4166870117188, "completions/mean_terminated_length": 834.4166870117188, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 0.7343962008141113, "frac_reward_zero_std": 0.5, "grad_norm": 0.06574416905641556, "kl": 0.0, "learning_rate": 1.3526570048309178e-07, "loss": -0.0002, "num_tokens": 66496513.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2165 }, { "completion_length": 695.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1225.0, "completions/max_terminated_length": 1225.0, "completions/mean_length": 695.0, "completions/mean_terminated_length": 695.0, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 0.7347354138398915, "frac_reward_zero_std": 0.5, "grad_norm": 0.07982386648654938, "kl": 0.0, "learning_rate": 1.3509316770186336e-07, "loss": 0.0003, "num_tokens": 66517315.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2166 }, { "completion_length": 760.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1159.0, "completions/max_terminated_length": 1159.0, "completions/mean_length": 760.8333740234375, "completions/mean_terminated_length": 760.8333740234375, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 0.7350746268656716, "frac_reward_zero_std": 0.5, "grad_norm": 0.0723530724644661, "kl": 0.0, "learning_rate": 1.349206349206349e-07, "loss": 0.0, "num_tokens": 66537557.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2167 }, { "completion_length": 2133.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3912.0, "completions/max_terminated_length": 3912.0, "completions/mean_length": 2133.416748046875, "completions/mean_terminated_length": 2133.416748046875, "completions/min_length": 1349.0, "completions/min_terminated_length": 1349.0, "epoch": 0.7354138398914518, "frac_reward_zero_std": 1.0, "grad_norm": 1.6219779297443893e-07, "kl": 0.0, "learning_rate": 1.347481021394065e-07, "loss": 0.0, "num_tokens": 66580720.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2168 }, { "completion_length": 972.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1363.0, "completions/max_terminated_length": 1363.0, "completions/mean_length": 972.25, "completions/mean_terminated_length": 972.25, "completions/min_length": 603.0, "completions/min_terminated_length": 603.0, "epoch": 0.735753052917232, "frac_reward_zero_std": 0.0, "grad_norm": 0.43157312273979187, "kl": 0.0, "learning_rate": 1.3457556935817804e-07, "loss": -0.0024, "num_tokens": 66609073.0, "reward": 1.0499999523162842, "reward_std": 0.24738392233848572, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.36306774616241455, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2169 }, { "completion_length": 623.6666717529297, "completions/clipped_ratio": 0.0, "completions/max_length": 1239.0, "completions/max_terminated_length": 1239.0, "completions/mean_length": 623.6666870117188, "completions/mean_terminated_length": 623.6666870117188, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.7360922659430122, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.3440303657694962e-07, "loss": 0.0, "num_tokens": 66628935.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2170 }, { "completion_length": 408.6666717529297, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 408.66668701171875, "completions/mean_terminated_length": 408.66668701171875, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.7364314789687924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.3423050379572118e-07, "loss": 0.0, "num_tokens": 66649829.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2171 }, { "completion_length": 1740.8333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4894.0, "completions/mean_length": 2289.916748046875, "completions/mean_terminated_length": 1899.0909423828125, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.7367706919945726, "frac_reward_zero_std": 0.5, "grad_norm": 0.09222375601530075, "kl": NaN, "learning_rate": 1.3405797101449276e-07, "loss": -0.0117, "num_tokens": 66680919.0, "reward": 0.7749999761581421, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 2172 }, { "completion_length": 549.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 549.5, "completions/mean_terminated_length": 549.5, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.7371099050203528, "frac_reward_zero_std": 1.0, "grad_norm": 1.216678526816395e-07, "kl": 0.0, "learning_rate": 1.338854382332643e-07, "loss": 0.0, "num_tokens": 66700995.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2173 }, { "completion_length": 1029.6667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2282.0, "completions/max_terminated_length": 2282.0, "completions/mean_length": 1029.666748046875, "completions/mean_terminated_length": 1029.666748046875, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 0.737449118046133, "frac_reward_zero_std": 0.5, "grad_norm": 0.05216016620397568, "kl": 0.0, "learning_rate": 1.3371290545203586e-07, "loss": -0.0001, "num_tokens": 66723257.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2174 }, { "completion_length": 967.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1679.0, "completions/max_terminated_length": 1679.0, "completions/mean_length": 967.5833740234375, "completions/mean_terminated_length": 967.5833740234375, "completions/min_length": 555.0, "completions/min_terminated_length": 555.0, "epoch": 0.7377883310719131, "frac_reward_zero_std": 0.5, "grad_norm": 0.07647698372602463, "kl": 0.0, "learning_rate": 1.3354037267080744e-07, "loss": 0.0013, "num_tokens": 66747330.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2175 }, { "completion_length": 731.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1311.0, "completions/max_terminated_length": 1311.0, "completions/mean_length": 731.75, "completions/mean_terminated_length": 731.75, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.7381275440976933, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.33367839889579e-07, "loss": 0.0, "num_tokens": 66767547.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2176 }, { "completion_length": 925.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2621.0, "completions/max_terminated_length": 2621.0, "completions/mean_length": 925.6666870117188, "completions/mean_terminated_length": 925.6666870117188, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 0.7384667571234735, "frac_reward_zero_std": 0.5, "grad_norm": 0.06673499196767807, "kl": 0.0, "learning_rate": 1.3319530710835057e-07, "loss": -0.0, "num_tokens": 66790595.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2177 }, { "completion_length": 997.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1879.0, "completions/max_terminated_length": 1879.0, "completions/mean_length": 997.6666870117188, "completions/mean_terminated_length": 997.6666870117188, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 0.7388059701492538, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.3302277432712215e-07, "loss": 0.0, "num_tokens": 66812569.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2178 }, { "completion_length": 893.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1283.0, "completions/max_terminated_length": 1283.0, "completions/mean_length": 893.3333740234375, "completions/mean_terminated_length": 893.3333740234375, "completions/min_length": 572.0, "completions/min_terminated_length": 572.0, "epoch": 0.739145183175034, "frac_reward_zero_std": 0.5, "grad_norm": 0.09296050667762756, "kl": 0.0, "learning_rate": 1.3285024154589373e-07, "loss": -0.0001, "num_tokens": 66836201.0, "reward": 1.2333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2179 }, { "completion_length": 563.75, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 563.75, "completions/mean_terminated_length": 563.75, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.7394843962008141, "frac_reward_zero_std": 0.5, "grad_norm": 0.07053308188915253, "kl": 0.0, "learning_rate": 1.3267770876466528e-07, "loss": -0.0001, "num_tokens": 66855140.0, "reward": 1.25, "reward_std": 0.05477222427725792, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2180 }, { "completion_length": 573.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 573.8333740234375, "completions/mean_terminated_length": 573.8333740234375, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.7398236092265943, "frac_reward_zero_std": 1.0, "grad_norm": 1.3899068562750472e-07, "kl": 0.0, "learning_rate": 1.3250517598343686e-07, "loss": 0.0, "num_tokens": 66874212.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2181 }, { "completion_length": 707.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 970.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 707.5, "completions/mean_terminated_length": 707.5, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 0.7401628222523745, "frac_reward_zero_std": 1.0, "grad_norm": 2.3253267045220127e-07, "kl": 0.0, "learning_rate": 1.3233264320220842e-07, "loss": 0.0, "num_tokens": 66895014.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2182 }, { "completion_length": 1071.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2818.0, "completions/max_terminated_length": 2818.0, "completions/mean_length": 1071.8333740234375, "completions/mean_terminated_length": 1071.8333740234375, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.7405020352781547, "frac_reward_zero_std": 0.5, "grad_norm": 0.09270206838846207, "kl": 0.0, "learning_rate": 1.3216011042098e-07, "loss": 0.0045, "num_tokens": 66921214.0, "reward": 0.7666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2183 }, { "completion_length": 863.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2356.0, "completions/max_terminated_length": 2356.0, "completions/mean_length": 863.75, "completions/mean_terminated_length": 863.75, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.7408412483039348, "frac_reward_zero_std": 0.5, "grad_norm": 0.5436593294143677, "kl": 0.0, "learning_rate": 1.3198757763975155e-07, "loss": -0.0029, "num_tokens": 66944395.0, "reward": 1.066666603088379, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941801071167, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2184 }, { "completion_length": 832.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1358.0, "completions/max_terminated_length": 1358.0, "completions/mean_length": 832.0833740234375, "completions/mean_terminated_length": 832.0833740234375, "completions/min_length": 583.0, "completions/min_terminated_length": 583.0, "epoch": 0.741180461329715, "frac_reward_zero_std": 0.5, "grad_norm": 0.07787755876779556, "kl": 0.0, "learning_rate": 1.318150448585231e-07, "loss": 0.0004, "num_tokens": 66964442.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2185 }, { "completion_length": 2427.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5132.0, "completions/max_terminated_length": 5132.0, "completions/mean_length": 2427.5, "completions/mean_terminated_length": 2427.5, "completions/min_length": 669.0, "completions/min_terminated_length": 669.0, "epoch": 0.7415196743554953, "frac_reward_zero_std": 0.5, "grad_norm": 0.48977670073509216, "kl": 0.0, "learning_rate": 1.3164251207729468e-07, "loss": 0.0072, "num_tokens": 67010024.0, "reward": 1.0333335399627686, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.2309401035308838, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2186 }, { "completion_length": 2030.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3895.0, "completions/max_terminated_length": 3895.0, "completions/mean_length": 2030.916748046875, "completions/mean_terminated_length": 2030.916748046875, "completions/min_length": 1094.0, "completions/min_terminated_length": 1094.0, "epoch": 0.7418588873812755, "frac_reward_zero_std": 0.0, "grad_norm": 0.8428034782409668, "kl": 0.0, "learning_rate": 1.3146997929606623e-07, "loss": -0.0214, "num_tokens": 67045723.0, "reward": 0.8333333730697632, "reward_std": 0.3823883533477783, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2187 }, { "completion_length": 853.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1405.0, "completions/max_terminated_length": 1405.0, "completions/mean_length": 853.25, "completions/mean_terminated_length": 853.25, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 0.7421981004070556, "frac_reward_zero_std": 0.5, "grad_norm": 0.05179527401924133, "kl": 0.0, "learning_rate": 1.312974465148378e-07, "loss": -0.0004, "num_tokens": 67069570.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2188 }, { "completion_length": 1312.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2705.0, "completions/max_terminated_length": 2705.0, "completions/mean_length": 1312.416748046875, "completions/mean_terminated_length": 1312.416748046875, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 0.7425373134328358, "frac_reward_zero_std": 0.0, "grad_norm": 0.3336423635482788, "kl": 0.0, "learning_rate": 1.3112491373360937e-07, "loss": 0.0045, "num_tokens": 67094217.0, "reward": 0.7708333730697632, "reward_std": 0.21288344264030457, "rewards/correctness_reward_func/mean": 0.4833333194255829, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 2189 }, { "completion_length": 1204.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2368.0, "completions/max_terminated_length": 2368.0, "completions/mean_length": 1204.0, "completions/mean_terminated_length": 1204.0, "completions/min_length": 527.0, "completions/min_terminated_length": 527.0, "epoch": 0.742876526458616, "frac_reward_zero_std": 1.0, "grad_norm": 1.7547486663715972e-07, "kl": 0.0, "learning_rate": 1.3095238095238095e-07, "loss": 0.0, "num_tokens": 67121655.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2190 }, { "completion_length": 1316.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2660.0, "completions/max_terminated_length": 2660.0, "completions/mean_length": 1316.0, "completions/mean_terminated_length": 1316.0, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 0.7432157394843962, "frac_reward_zero_std": 0.5, "grad_norm": 0.13136065006256104, "kl": 0.0, "learning_rate": 1.307798481711525e-07, "loss": -0.0007, "num_tokens": 67147521.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2191 }, { "completion_length": 826.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2020.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 826.0833740234375, "completions/mean_terminated_length": 826.0833740234375, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.7435549525101763, "frac_reward_zero_std": 0.5, "grad_norm": 0.08031333237886429, "kl": 0.0, "learning_rate": 1.3060731538992408e-07, "loss": -0.0002, "num_tokens": 67168930.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2192 }, { "completion_length": 906.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1342.0, "completions/max_terminated_length": 1342.0, "completions/mean_length": 906.0833740234375, "completions/mean_terminated_length": 906.0833740234375, "completions/min_length": 594.0, "completions/min_terminated_length": 594.0, "epoch": 0.7438941655359566, "frac_reward_zero_std": 0.5, "grad_norm": 0.09493644535541534, "kl": 0.0, "learning_rate": 1.3043478260869563e-07, "loss": 0.0003, "num_tokens": 67190807.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2193 }, { "completion_length": 1107.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1818.0, "completions/max_terminated_length": 1818.0, "completions/mean_length": 1107.25, "completions/mean_terminated_length": 1107.25, "completions/min_length": 656.0, "completions/min_terminated_length": 656.0, "epoch": 0.7442333785617368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.3026224982746724e-07, "loss": 0.0, "num_tokens": 67211690.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2194 }, { "completion_length": 1328.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2925.0, "completions/max_terminated_length": 2925.0, "completions/mean_length": 1328.75, "completions/mean_terminated_length": 1328.75, "completions/min_length": 524.0, "completions/min_terminated_length": 524.0, "epoch": 0.744572591587517, "frac_reward_zero_std": 0.5, "grad_norm": 0.5979235172271729, "kl": 0.0, "learning_rate": 1.300897170462388e-07, "loss": 0.0186, "num_tokens": 67240343.0, "reward": 0.8666667342185974, "reward_std": 0.2581988573074341, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.42497774958610535, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2195 }, { "completion_length": 951.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1294.0, "completions/max_terminated_length": 1294.0, "completions/mean_length": 951.5833740234375, "completions/mean_terminated_length": 951.5833740234375, "completions/min_length": 681.0, "completions/min_terminated_length": 681.0, "epoch": 0.7449118046132972, "frac_reward_zero_std": 1.0, "grad_norm": 9.799709488333974e-08, "kl": 0.0, "learning_rate": 1.2991718426501034e-07, "loss": 0.0, "num_tokens": 67264848.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2196 }, { "completion_length": 855.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1253.0, "completions/max_terminated_length": 1253.0, "completions/mean_length": 855.1666870117188, "completions/mean_terminated_length": 855.1666870117188, "completions/min_length": 522.0, "completions/min_terminated_length": 522.0, "epoch": 0.7452510176390773, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.2974465148378192e-07, "loss": 0.0, "num_tokens": 67284890.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2197 }, { "completion_length": 715.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1032.0, "completions/max_terminated_length": 1032.0, "completions/mean_length": 715.9166870117188, "completions/mean_terminated_length": 715.9166870117188, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 0.7455902306648575, "frac_reward_zero_std": 1.0, "grad_norm": 1.701045135860113e-07, "kl": 0.0, "learning_rate": 1.2957211870255347e-07, "loss": 0.0, "num_tokens": 67304197.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2198 }, { "completion_length": 1488.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2766.0, "completions/max_terminated_length": 2766.0, "completions/mean_length": 1488.416748046875, "completions/mean_terminated_length": 1488.416748046875, "completions/min_length": 684.0, "completions/min_terminated_length": 684.0, "epoch": 0.7459294436906377, "frac_reward_zero_std": 0.5, "grad_norm": 0.4703347086906433, "kl": 0.0, "learning_rate": 1.2939958592132505e-07, "loss": 0.0098, "num_tokens": 67330476.0, "reward": 1.133333444595337, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2199 }, { "completion_length": 1980.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 6438.0, "completions/max_terminated_length": 6438.0, "completions/mean_length": 1980.916748046875, "completions/mean_terminated_length": 1980.916748046875, "completions/min_length": 602.0, "completions/min_terminated_length": 602.0, "epoch": 0.746268656716418, "frac_reward_zero_std": 1.0, "grad_norm": 3.182997829753731e-07, "kl": 0.0, "learning_rate": 1.292270531400966e-07, "loss": 0.0, "num_tokens": 67366883.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2200 }, { "completion_length": 684.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1117.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 684.8333740234375, "completions/mean_terminated_length": 684.8333740234375, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 0.7466078697421981, "frac_reward_zero_std": 1.0, "grad_norm": 8.352996871963114e-08, "kl": 0.0, "learning_rate": 1.2905452035886819e-07, "loss": 0.0, "num_tokens": 67386075.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2201 }, { "completion_length": 1269.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2293.0, "completions/max_terminated_length": 2293.0, "completions/mean_length": 1269.416748046875, "completions/mean_terminated_length": 1269.416748046875, "completions/min_length": 721.0, "completions/min_terminated_length": 721.0, "epoch": 0.7469470827679783, "frac_reward_zero_std": 0.5, "grad_norm": 0.10603316873311996, "kl": 0.0, "learning_rate": 1.2888198757763974e-07, "loss": 0.0003, "num_tokens": 67409384.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2202 }, { "completion_length": 647.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 647.4166870117188, "completions/mean_terminated_length": 647.4166870117188, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.7472862957937585, "frac_reward_zero_std": 0.0, "grad_norm": 0.08066590875387192, "kl": 0.0, "learning_rate": 1.2870945479641132e-07, "loss": -0.0004, "num_tokens": 67431643.0, "reward": 0.7708333730697632, "reward_std": 0.07144342362880707, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 2203 }, { "completion_length": 1957.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4385.0, "completions/max_terminated_length": 4385.0, "completions/mean_length": 1957.75, "completions/mean_terminated_length": 1957.75, "completions/min_length": 596.0, "completions/min_terminated_length": 596.0, "epoch": 0.7476255088195387, "frac_reward_zero_std": 1.0, "grad_norm": 1.9429411679539044e-07, "kl": 0.0, "learning_rate": 1.2853692201518287e-07, "loss": 0.0, "num_tokens": 67466734.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2204 }, { "completion_length": 1120.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2456.0, "completions/max_terminated_length": 2456.0, "completions/mean_length": 1120.0833740234375, "completions/mean_terminated_length": 1120.0833740234375, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.7479647218453188, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.2836438923395445e-07, "loss": 0.0, "num_tokens": 67489601.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2205 }, { "completion_length": 652.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1728.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 652.4166870117188, "completions/mean_terminated_length": 652.4166870117188, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.748303934871099, "frac_reward_zero_std": 0.5, "grad_norm": 0.04975380003452301, "kl": 0.0, "learning_rate": 1.28191856452726e-07, "loss": 0.0004, "num_tokens": 67505176.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2206 }, { "completion_length": 2915.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6209.0, "completions/max_terminated_length": 6209.0, "completions/mean_length": 2915.166748046875, "completions/mean_terminated_length": 2915.166748046875, "completions/min_length": 853.0, "completions/min_terminated_length": 853.0, "epoch": 0.7486431478968792, "frac_reward_zero_std": 0.5, "grad_norm": 0.06849915534257889, "kl": 0.0, "learning_rate": 1.2801932367149758e-07, "loss": -0.0, "num_tokens": 67553958.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2207 }, { "completion_length": 448.25001525878906, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 448.25, "completions/mean_terminated_length": 448.25, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.7489823609226595, "frac_reward_zero_std": 0.5, "grad_norm": 0.043897103518247604, "kl": 0.0, "learning_rate": 1.2784679089026913e-07, "loss": -0.0001, "num_tokens": 67572405.0, "reward": 0.7875000834465027, "reward_std": 0.03061862848699093, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 2208 }, { "completion_length": 947.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1726.0, "completions/max_terminated_length": 1726.0, "completions/mean_length": 947.6666870117188, "completions/mean_terminated_length": 947.6666870117188, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.7493215739484396, "frac_reward_zero_std": 0.5, "grad_norm": 0.06710971891880035, "kl": 0.0, "learning_rate": 1.276742581090407e-07, "loss": -0.0007, "num_tokens": 67596773.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2209 }, { "completion_length": 1049.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1774.0, "completions/max_terminated_length": 1774.0, "completions/mean_length": 1049.5833740234375, "completions/mean_terminated_length": 1049.5833740234375, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "epoch": 0.7496607869742198, "frac_reward_zero_std": 0.5, "grad_norm": 0.07258152961730957, "kl": 0.0, "learning_rate": 1.275017253278123e-07, "loss": -0.0008, "num_tokens": 67620378.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2210 }, { "completion_length": 595.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1043.0, "completions/max_terminated_length": 1043.0, "completions/mean_length": 595.3333740234375, "completions/mean_terminated_length": 595.3333740234375, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.75, "frac_reward_zero_std": 0.0, "grad_norm": 0.09315574914216995, "kl": 0.0, "learning_rate": 1.2732919254658385e-07, "loss": 0.0001, "num_tokens": 67642936.0, "reward": 1.2166666984558105, "reward_std": 0.09246456623077393, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2211 }, { "completion_length": 925.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2231.0, "completions/max_terminated_length": 2231.0, "completions/mean_length": 925.5, "completions/mean_terminated_length": 925.5, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.7503392130257802, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.2715665976535543e-07, "loss": 0.0, "num_tokens": 67665004.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2212 }, { "completion_length": 1262.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2459.0, "completions/max_terminated_length": 2459.0, "completions/mean_length": 1262.5833740234375, "completions/mean_terminated_length": 1262.5833740234375, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.7506784260515604, "frac_reward_zero_std": 0.5, "grad_norm": 0.045680899173021317, "kl": 0.0, "learning_rate": 1.2698412698412698e-07, "loss": -0.0001, "num_tokens": 67691963.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2213 }, { "completion_length": 1103.4167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2378.0, "completions/max_terminated_length": 2378.0, "completions/mean_length": 1103.416748046875, "completions/mean_terminated_length": 1103.416748046875, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.7510176390773405, "frac_reward_zero_std": 0.5, "grad_norm": 0.29365622997283936, "kl": 0.0, "learning_rate": 1.2681159420289856e-07, "loss": -0.0084, "num_tokens": 67718302.0, "reward": 1.133333444595337, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2214 }, { "completion_length": 1704.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4762.0, "completions/max_terminated_length": 4762.0, "completions/mean_length": 1704.5, "completions/mean_terminated_length": 1704.5, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.7513568521031208, "frac_reward_zero_std": 0.5, "grad_norm": 0.5076410174369812, "kl": 0.0, "learning_rate": 1.266390614216701e-07, "loss": 0.015, "num_tokens": 67750108.0, "reward": 1.0, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.43064433336257935, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2215 }, { "completion_length": 381.75001525878906, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 381.75, "completions/mean_terminated_length": 381.75, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.751696065128901, "frac_reward_zero_std": 0.5, "grad_norm": 0.046513523906469345, "kl": 0.0, "learning_rate": 1.264665286404417e-07, "loss": 0.0001, "num_tokens": 67764949.0, "reward": 1.2333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2216 }, { "completion_length": 1751.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3322.0, "completions/max_terminated_length": 3322.0, "completions/mean_length": 1751.0833740234375, "completions/mean_terminated_length": 1751.0833740234375, "completions/min_length": 670.0, "completions/min_terminated_length": 670.0, "epoch": 0.7520352781546812, "frac_reward_zero_std": 0.5, "grad_norm": 0.44721847772598267, "kl": 0.0, "learning_rate": 1.2629399585921324e-07, "loss": -0.0065, "num_tokens": 67797620.0, "reward": 0.6499999761581421, "reward_std": 0.17606817185878754, "rewards/correctness_reward_func/mean": 0.3499999940395355, "rewards/correctness_reward_func/std": 0.4358898997306824, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2217 }, { "completion_length": 879.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1501.0, "completions/max_terminated_length": 1501.0, "completions/mean_length": 879.4166870117188, "completions/mean_terminated_length": 879.4166870117188, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 0.7523744911804613, "frac_reward_zero_std": 1.0, "grad_norm": 1.0248873394402835e-07, "kl": 0.0, "learning_rate": 1.2612146307798482e-07, "loss": 0.0, "num_tokens": 67818493.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2218 }, { "completion_length": 432.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 432.5833435058594, "completions/mean_terminated_length": 432.5833435058594, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.7527137042062415, "frac_reward_zero_std": 0.5, "grad_norm": 0.0442735031247139, "kl": 0.0, "learning_rate": 1.2594893029675637e-07, "loss": -0.0, "num_tokens": 67833398.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2219 }, { "completion_length": 1006.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1873.0, "completions/max_terminated_length": 1873.0, "completions/mean_length": 1006.0, "completions/mean_terminated_length": 1006.0, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "epoch": 0.7530529172320217, "frac_reward_zero_std": 0.0, "grad_norm": 0.27989304065704346, "kl": 0.0, "learning_rate": 1.2577639751552793e-07, "loss": -0.0074, "num_tokens": 67860812.0, "reward": 1.1166666746139526, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2220 }, { "completion_length": 642.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 642.75, "completions/mean_terminated_length": 642.75, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.7533921302578019, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.256038647342995e-07, "loss": 0.0, "num_tokens": 67882079.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2221 }, { "completion_length": 974.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1320.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 974.25, "completions/mean_terminated_length": 974.25, "completions/min_length": 616.0, "completions/min_terminated_length": 616.0, "epoch": 0.753731343283582, "frac_reward_zero_std": 0.5, "grad_norm": 0.41481757164001465, "kl": 0.0, "learning_rate": 1.2543133195307106e-07, "loss": 0.0015, "num_tokens": 67909118.0, "reward": 1.2166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2222 }, { "completion_length": 685.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1056.0, "completions/max_terminated_length": 1056.0, "completions/mean_length": 685.0, "completions/mean_terminated_length": 685.0, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 0.7540705563093623, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.2525879917184264e-07, "loss": 0.0, "num_tokens": 67930616.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2223 }, { "completion_length": 678.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1114.0, "completions/max_terminated_length": 1114.0, "completions/mean_length": 678.5833740234375, "completions/mean_terminated_length": 678.5833740234375, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.7544097693351425, "frac_reward_zero_std": 0.5, "grad_norm": 0.05658742040395737, "kl": 0.0, "learning_rate": 1.250862663906142e-07, "loss": -0.0005, "num_tokens": 67946043.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2224 }, { "completion_length": 2059.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 3468.0, "completions/max_terminated_length": 3468.0, "completions/mean_length": 2059.5, "completions/mean_terminated_length": 2059.5, "completions/min_length": 1058.0, "completions/min_terminated_length": 1058.0, "epoch": 0.7547489823609227, "frac_reward_zero_std": 0.5, "grad_norm": 0.5075283646583557, "kl": 0.0, "learning_rate": 1.2491373360938577e-07, "loss": 0.0095, "num_tokens": 67982211.0, "reward": 0.6333333253860474, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2225 }, { "completion_length": 1437.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2281.0, "completions/max_terminated_length": 2281.0, "completions/mean_length": 1437.75, "completions/mean_terminated_length": 1437.75, "completions/min_length": 1085.0, "completions/min_terminated_length": 1085.0, "epoch": 0.7550881953867028, "frac_reward_zero_std": 0.5, "grad_norm": 0.5271453261375427, "kl": 0.0, "learning_rate": 1.2474120082815735e-07, "loss": -0.005, "num_tokens": 68011902.0, "reward": 1.133333444595337, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2226 }, { "completion_length": 645.0, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 645.0, "completions/mean_terminated_length": 645.0, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 0.755427408412483, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.245686680469289e-07, "loss": 0.0, "num_tokens": 68034042.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2227 }, { "completion_length": 707.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1534.0, "completions/max_terminated_length": 1534.0, "completions/mean_length": 707.5833740234375, "completions/mean_terminated_length": 707.5833740234375, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.7557666214382632, "frac_reward_zero_std": 0.5, "grad_norm": 0.09808249026536942, "kl": 0.0, "learning_rate": 1.2439613526570048e-07, "loss": 0.0021, "num_tokens": 68056195.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2228 }, { "completion_length": 1548.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3253.0, "completions/max_terminated_length": 3253.0, "completions/mean_length": 1548.75, "completions/mean_terminated_length": 1548.75, "completions/min_length": 652.0, "completions/min_terminated_length": 652.0, "epoch": 0.7561058344640434, "frac_reward_zero_std": 0.5, "grad_norm": 0.12493033707141876, "kl": 0.0, "learning_rate": 1.2422360248447204e-07, "loss": -0.0011, "num_tokens": 68085214.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2229 }, { "completion_length": 1187.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2849.0, "completions/max_terminated_length": 2849.0, "completions/mean_length": 1187.75, "completions/mean_terminated_length": 1187.75, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.7564450474898237, "frac_reward_zero_std": 0.0, "grad_norm": 0.41731247305870056, "kl": 0.0, "learning_rate": 1.2405106970324361e-07, "loss": -0.0043, "num_tokens": 68112139.0, "reward": 1.1500000953674316, "reward_std": 0.2557639479637146, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444525599479675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2230 }, { "completion_length": 523.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 523.8333740234375, "completions/mean_terminated_length": 523.8333740234375, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.7567842605156038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.2387853692201517e-07, "loss": 0.0, "num_tokens": 68129057.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2231 }, { "completion_length": 1117.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2888.0, "completions/max_terminated_length": 2888.0, "completions/mean_length": 1117.166748046875, "completions/mean_terminated_length": 1117.166748046875, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.757123473541384, "frac_reward_zero_std": 1.0, "grad_norm": 1.372334281768417e-07, "kl": 0.0, "learning_rate": 1.2370600414078675e-07, "loss": 0.0, "num_tokens": 68155735.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2232 }, { "completion_length": 643.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1214.0, "completions/max_terminated_length": 1214.0, "completions/mean_length": 643.75, "completions/mean_terminated_length": 643.75, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.7574626865671642, "frac_reward_zero_std": 0.5, "grad_norm": 0.28315895795822144, "kl": 0.0, "learning_rate": 1.2353347135955833e-07, "loss": -0.0027, "num_tokens": 68175748.0, "reward": 1.2000000476837158, "reward_std": 0.20000001788139343, "rewards/correctness_reward_func/mean": 0.9000000357627869, "rewards/correctness_reward_func/std": 0.28919950127601624, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2233 }, { "completion_length": 763.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1461.0, "completions/max_terminated_length": 1461.0, "completions/mean_length": 763.5, "completions/mean_terminated_length": 763.5, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.7578018995929444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.2336093857832988e-07, "loss": 0.0, "num_tokens": 68200390.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2234 }, { "completion_length": 1153.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 3646.0, "completions/max_terminated_length": 3646.0, "completions/mean_length": 1153.5833740234375, "completions/mean_terminated_length": 1153.5833740234375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.7581411126187245, "frac_reward_zero_std": 0.5, "grad_norm": 0.06398869305849075, "kl": 0.0, "learning_rate": 1.2318840579710146e-07, "loss": -0.0, "num_tokens": 68225453.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2235 }, { "completion_length": 739.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1163.0, "completions/max_terminated_length": 1163.0, "completions/mean_length": 739.8333740234375, "completions/mean_terminated_length": 739.8333740234375, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 0.7584803256445047, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.23015873015873e-07, "loss": 0.0, "num_tokens": 68246889.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2236 }, { "completion_length": 1103.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2952.0, "completions/max_terminated_length": 2952.0, "completions/mean_length": 1103.916748046875, "completions/mean_terminated_length": 1103.916748046875, "completions/min_length": 624.0, "completions/min_terminated_length": 624.0, "epoch": 0.758819538670285, "frac_reward_zero_std": 0.5, "grad_norm": 0.3186628520488739, "kl": 0.0, "learning_rate": 1.2284334023464456e-07, "loss": -0.001, "num_tokens": 68274140.0, "reward": 0.8666666746139526, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2237 }, { "completion_length": 769.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1141.0, "completions/max_terminated_length": 1141.0, "completions/mean_length": 769.6666870117188, "completions/mean_terminated_length": 769.6666870117188, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.7591587516960652, "frac_reward_zero_std": 0.0, "grad_norm": 0.2749316394329071, "kl": 0.0, "learning_rate": 1.2267080745341614e-07, "loss": -0.0029, "num_tokens": 68291806.0, "reward": 1.066666841506958, "reward_std": 0.21493908762931824, "rewards/correctness_reward_func/mean": 0.7666667103767395, "rewards/correctness_reward_func/std": 0.25346091389656067, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2238 }, { "completion_length": 741.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1332.0, "completions/max_terminated_length": 1332.0, "completions/mean_length": 741.4166870117188, "completions/mean_terminated_length": 741.4166870117188, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.7594979647218453, "frac_reward_zero_std": 1.0, "grad_norm": 2.1559623064604239e-07, "kl": 0.0, "learning_rate": 1.224982746721877e-07, "loss": 0.0, "num_tokens": 68312307.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2239 }, { "completion_length": 744.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1494.0, "completions/max_terminated_length": 1494.0, "completions/mean_length": 744.3333740234375, "completions/mean_terminated_length": 744.3333740234375, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.7598371777476255, "frac_reward_zero_std": 0.5, "grad_norm": 0.35311397910118103, "kl": 0.0, "learning_rate": 1.2232574189095927e-07, "loss": -0.0041, "num_tokens": 68333611.0, "reward": 1.2166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2240 }, { "completion_length": 902.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1442.0, "completions/max_terminated_length": 1442.0, "completions/mean_length": 902.4166870117188, "completions/mean_terminated_length": 902.4166870117188, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 0.7601763907734057, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.2215320910973085e-07, "loss": 0.0, "num_tokens": 68353176.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2241 }, { "completion_length": 916.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1635.0, "completions/max_terminated_length": 1635.0, "completions/mean_length": 916.3333740234375, "completions/mean_terminated_length": 916.3333740234375, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.7605156037991859, "frac_reward_zero_std": 1.0, "grad_norm": 9.33147319415184e-08, "kl": 0.0, "learning_rate": 1.219806763285024e-07, "loss": 0.0, "num_tokens": 68377732.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2242 }, { "completion_length": 557.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1113.0, "completions/max_terminated_length": 1113.0, "completions/mean_length": 557.6666870117188, "completions/mean_terminated_length": 557.6666870117188, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.760854816824966, "frac_reward_zero_std": 0.5, "grad_norm": 0.05936620756983757, "kl": 0.0, "learning_rate": 1.2180814354727399e-07, "loss": -0.0002, "num_tokens": 68399118.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2243 }, { "completion_length": 1568.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3723.0, "completions/max_terminated_length": 3723.0, "completions/mean_length": 1568.0, "completions/mean_terminated_length": 1568.0, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 0.7611940298507462, "frac_reward_zero_std": 1.0, "grad_norm": 1.2730076548450597e-07, "kl": 0.0, "learning_rate": 1.2163561076604554e-07, "loss": 0.0, "num_tokens": 68434626.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2244 }, { "completion_length": 1412.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 3906.0, "completions/max_terminated_length": 3906.0, "completions/mean_length": 1412.25, "completions/mean_terminated_length": 1412.25, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 0.7615332428765265, "frac_reward_zero_std": 1.0, "grad_norm": 1.701313436797136e-07, "kl": 0.0, "learning_rate": 1.2146307798481712e-07, "loss": 0.0, "num_tokens": 68462613.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2245 }, { "completion_length": 731.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1307.0, "completions/max_terminated_length": 1307.0, "completions/mean_length": 731.3333740234375, "completions/mean_terminated_length": 731.3333740234375, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.7618724559023067, "frac_reward_zero_std": 0.5, "grad_norm": 0.05525204911828041, "kl": 0.0, "learning_rate": 1.2129054520358867e-07, "loss": 0.0001, "num_tokens": 68483995.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2246 }, { "completion_length": 648.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 648.4166870117188, "completions/mean_terminated_length": 648.4166870117188, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 0.7622116689280869, "frac_reward_zero_std": 1.0, "grad_norm": 1.2362978907276556e-07, "kl": 0.0, "learning_rate": 1.2111801242236025e-07, "loss": 0.0, "num_tokens": 68506206.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2247 }, { "completion_length": 1383.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2568.0, "completions/max_terminated_length": 2568.0, "completions/mean_length": 1383.0, "completions/mean_terminated_length": 1383.0, "completions/min_length": 636.0, "completions/min_terminated_length": 636.0, "epoch": 0.762550881953867, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.209454796411318e-07, "loss": 0.0, "num_tokens": 68534142.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2248 }, { "completion_length": 587.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1061.0, "completions/max_terminated_length": 1061.0, "completions/mean_length": 587.0, "completions/mean_terminated_length": 587.0, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.7628900949796472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.2077294685990338e-07, "loss": 0.0, "num_tokens": 68556024.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2249 }, { "completion_length": 1903.5001220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3306.0, "completions/mean_length": 2452.58349609375, "completions/mean_terminated_length": 2076.54541015625, "completions/min_length": 1472.0, "completions/min_terminated_length": 1472.0, "epoch": 0.7632293080054274, "frac_reward_zero_std": 0.5, "grad_norm": 0.2177281230688095, "kl": NaN, "learning_rate": 1.2060041407867494e-07, "loss": -0.0212, "num_tokens": 68594052.0, "reward": 0.7041666507720947, "reward_std": 0.23474274575710297, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 2250 }, { "completion_length": 544.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 904.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 544.3333740234375, "completions/mean_terminated_length": 544.3333740234375, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.7635685210312076, "frac_reward_zero_std": 0.5, "grad_norm": 0.05169430375099182, "kl": 0.0, "learning_rate": 1.2042788129744651e-07, "loss": 0.0001, "num_tokens": 68611924.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2251 }, { "completion_length": 1107.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2079.0, "completions/max_terminated_length": 2079.0, "completions/mean_length": 1107.166748046875, "completions/mean_terminated_length": 1107.166748046875, "completions/min_length": 783.0, "completions/min_terminated_length": 783.0, "epoch": 0.7639077340569878, "frac_reward_zero_std": 0.0, "grad_norm": 0.33403638005256653, "kl": 0.0, "learning_rate": 1.2025534851621807e-07, "loss": -0.0025, "num_tokens": 68637246.0, "reward": 1.1166666746139526, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2252 }, { "completion_length": 1040.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1725.0, "completions/max_terminated_length": 1725.0, "completions/mean_length": 1040.3333740234375, "completions/mean_terminated_length": 1040.3333740234375, "completions/min_length": 549.0, "completions/min_terminated_length": 549.0, "epoch": 0.764246947082768, "frac_reward_zero_std": 0.0, "grad_norm": 0.10644125938415527, "kl": 0.0, "learning_rate": 1.2008281573498965e-07, "loss": 0.002, "num_tokens": 68657830.0, "reward": 1.2166666984558105, "reward_std": 0.09246458113193512, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2253 }, { "completion_length": 756.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1071.0, "completions/max_terminated_length": 1071.0, "completions/mean_length": 756.9166870117188, "completions/mean_terminated_length": 756.9166870117188, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 0.7645861601085482, "frac_reward_zero_std": 1.0, "grad_norm": 1.4671179826564185e-07, "kl": 0.0, "learning_rate": 1.199102829537612e-07, "loss": 0.0, "num_tokens": 68679489.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2254 }, { "completion_length": 1129.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1552.0, "completions/max_terminated_length": 1552.0, "completions/mean_length": 1129.0, "completions/mean_terminated_length": 1129.0, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 0.7649253731343284, "frac_reward_zero_std": 0.5, "grad_norm": 0.08349355310201645, "kl": 0.0, "learning_rate": 1.1973775017253278e-07, "loss": 0.0007, "num_tokens": 68710989.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2255 }, { "completion_length": 622.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 806.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 622.75, "completions/mean_terminated_length": 622.75, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 0.7652645861601085, "frac_reward_zero_std": 0.5, "grad_norm": 0.08113700896501541, "kl": 0.0, "learning_rate": 1.1956521739130436e-07, "loss": 0.0003, "num_tokens": 68729616.0, "reward": 0.75, "reward_std": 0.05477222427725792, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2256 }, { "completion_length": 851.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1323.0, "completions/max_terminated_length": 1323.0, "completions/mean_length": 851.75, "completions/mean_terminated_length": 851.75, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 0.7656037991858887, "frac_reward_zero_std": 0.5, "grad_norm": 0.07393565773963928, "kl": 0.0, "learning_rate": 1.193926846100759e-07, "loss": 0.0004, "num_tokens": 68751045.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2257 }, { "completion_length": 2248.7500610351562, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5987.0, "completions/mean_length": 2797.83349609375, "completions/mean_terminated_length": 2453.181884765625, "completions/min_length": 644.0, "completions/min_terminated_length": 644.0, "epoch": 0.7659430122116689, "frac_reward_zero_std": 0.5, "grad_norm": 0.13653559982776642, "kl": NaN, "learning_rate": 1.192201518288475e-07, "loss": -0.0138, "num_tokens": 68790606.0, "reward": 0.6750000715255737, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 2258 }, { "completion_length": 653.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1057.0, "completions/max_terminated_length": 1057.0, "completions/mean_length": 653.1666870117188, "completions/mean_terminated_length": 653.1666870117188, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.7662822252374492, "frac_reward_zero_std": 0.5, "grad_norm": 0.06496720761060715, "kl": 0.0, "learning_rate": 1.1904761904761903e-07, "loss": -0.0004, "num_tokens": 68809964.0, "reward": 1.133333444595337, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2259 }, { "completion_length": 1452.416748046875, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4642.0, "completions/mean_length": 4197.83349609375, "completions/mean_terminated_length": 2489.857177734375, "completions/min_length": 1009.0, "completions/min_terminated_length": 1009.0, "epoch": 0.7666214382632293, "frac_reward_zero_std": 0.0, "grad_norm": 0.7320407629013062, "kl": NaN, "learning_rate": 1.1887508626639061e-07, "loss": 0.0042, "num_tokens": 68839531.0, "reward": 0.5083333253860474, "reward_std": 0.31943613290786743, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.17499999701976776, "rewards/format_reward_func/std": 0.15447859466075897, "step": 2260 }, { "completion_length": 631.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 839.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 631.8333740234375, "completions/mean_terminated_length": 631.8333740234375, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.7669606512890095, "frac_reward_zero_std": 0.5, "grad_norm": 0.08150289952754974, "kl": 0.0, "learning_rate": 1.1870255348516218e-07, "loss": -0.0001, "num_tokens": 68857613.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2261 }, { "completion_length": 2232.416748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4325.0, "completions/mean_length": 2781.5, "completions/mean_terminated_length": 2435.36376953125, "completions/min_length": 788.0, "completions/min_terminated_length": 788.0, "epoch": 0.7672998643147897, "frac_reward_zero_std": 0.5, "grad_norm": 0.8333950638771057, "kl": NaN, "learning_rate": 1.1853002070393374e-07, "loss": -0.0458, "num_tokens": 68897536.0, "reward": 0.8416666984558105, "reward_std": 0.2835783362388611, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.42497774958610535, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 2262 }, { "completion_length": 544.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1076.0, "completions/max_terminated_length": 1076.0, "completions/mean_length": 544.9166870117188, "completions/mean_terminated_length": 544.9166870117188, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.7676390773405699, "frac_reward_zero_std": 1.0, "grad_norm": 8.027829068169012e-08, "kl": 0.0, "learning_rate": 1.1835748792270531e-07, "loss": 0.0, "num_tokens": 68917161.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2263 }, { "completion_length": 552.0, "completions/clipped_ratio": 0.0, "completions/max_length": 944.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 552.0, "completions/mean_terminated_length": 552.0, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.76797829036635, "frac_reward_zero_std": 0.5, "grad_norm": 0.04227226972579956, "kl": 0.0, "learning_rate": 1.1818495514147687e-07, "loss": -0.0002, "num_tokens": 68937279.0, "reward": 1.2874999046325684, "reward_std": 0.03061862848699093, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 2264 }, { "completion_length": 1165.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 3144.0, "completions/max_terminated_length": 3144.0, "completions/mean_length": 1165.25, "completions/mean_terminated_length": 1165.25, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.7683175033921302, "frac_reward_zero_std": 0.5, "grad_norm": 0.12164901942014694, "kl": 0.0, "learning_rate": 1.1801242236024844e-07, "loss": 0.0018, "num_tokens": 68961942.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2265 }, { "completion_length": 678.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1037.0, "completions/max_terminated_length": 1037.0, "completions/mean_length": 678.0, "completions/mean_terminated_length": 678.0, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.7686567164179104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.1783988957902e-07, "loss": 0.0, "num_tokens": 68982078.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2266 }, { "completion_length": 522.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 522.3333740234375, "completions/mean_terminated_length": 522.3333740234375, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.7689959294436907, "frac_reward_zero_std": 0.5, "grad_norm": 0.027568982914090157, "kl": 0.0, "learning_rate": 1.1766735679779157e-07, "loss": -0.0001, "num_tokens": 68998084.0, "reward": 1.0875000953674316, "reward_std": 0.030618607997894287, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 2267 }, { "completion_length": 2185.416748046875, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5854.0, "completions/mean_length": 3283.58349609375, "completions/mean_terminated_length": 2622.5, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.7693351424694709, "frac_reward_zero_std": 0.0, "grad_norm": 0.4235885739326477, "kl": NaN, "learning_rate": 1.1749482401656315e-07, "loss": -0.0388, "num_tokens": 69040251.0, "reward": 0.5166667699813843, "reward_std": 0.2840188145637512, "rewards/correctness_reward_func/mean": 0.2666666805744171, "rewards/correctness_reward_func/std": 0.393892765045166, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 2268 }, { "completion_length": 1102.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2386.0, "completions/max_terminated_length": 2386.0, "completions/mean_length": 1102.166748046875, "completions/mean_terminated_length": 1102.166748046875, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.769674355495251, "frac_reward_zero_std": 0.0, "grad_norm": 0.5189723968505859, "kl": 0.0, "learning_rate": 1.1732229123533472e-07, "loss": 0.019, "num_tokens": 69061505.0, "reward": 1.0333333015441895, "reward_std": 0.3146860599517822, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.44585633277893066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2269 }, { "completion_length": 504.1666717529297, "completions/clipped_ratio": 0.0, "completions/max_length": 1399.0, "completions/max_terminated_length": 1399.0, "completions/mean_length": 504.16668701171875, "completions/mean_terminated_length": 504.16668701171875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.7700135685210312, "frac_reward_zero_std": 1.0, "grad_norm": 1.453059184086669e-07, "kl": 0.0, "learning_rate": 1.1714975845410628e-07, "loss": 0.0, "num_tokens": 69075691.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2270 }, { "completion_length": 844.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1578.0, "completions/max_terminated_length": 1578.0, "completions/mean_length": 844.6666870117188, "completions/mean_terminated_length": 844.6666870117188, "completions/min_length": 581.0, "completions/min_terminated_length": 581.0, "epoch": 0.7703527815468114, "frac_reward_zero_std": 1.0, "grad_norm": 9.218145891054519e-08, "kl": 0.0, "learning_rate": 1.1697722567287784e-07, "loss": 0.0, "num_tokens": 69096393.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2271 }, { "completion_length": 730.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1243.0, "completions/max_terminated_length": 1243.0, "completions/mean_length": 730.25, "completions/mean_terminated_length": 730.25, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.7706919945725916, "frac_reward_zero_std": 0.5, "grad_norm": 0.07292886823415756, "kl": 0.0, "learning_rate": 1.168046928916494e-07, "loss": -0.0001, "num_tokens": 69118530.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2272 }, { "completion_length": 697.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1449.0, "completions/max_terminated_length": 1449.0, "completions/mean_length": 697.5, "completions/mean_terminated_length": 697.5, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.7710312075983717, "frac_reward_zero_std": 0.0, "grad_norm": 0.2519453167915344, "kl": 0.0, "learning_rate": 1.1663216011042097e-07, "loss": 0.0042, "num_tokens": 69140148.0, "reward": 1.066666841506958, "reward_std": 0.21493908762931824, "rewards/correctness_reward_func/mean": 0.7666667103767395, "rewards/correctness_reward_func/std": 0.25346091389656067, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2273 }, { "completion_length": 593.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 593.6666870117188, "completions/mean_terminated_length": 593.6666870117188, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.771370420624152, "frac_reward_zero_std": 1.0, "grad_norm": 1.0999101363040609e-07, "kl": 0.0, "learning_rate": 1.1645962732919253e-07, "loss": 0.0, "num_tokens": 69158990.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2274 }, { "completion_length": 927.4167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2278.0, "completions/max_terminated_length": 2278.0, "completions/mean_length": 927.4166870117188, "completions/mean_terminated_length": 927.4166870117188, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 0.7717096336499322, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.162870945479641e-07, "loss": 0.0, "num_tokens": 69181429.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2275 }, { "completion_length": 1064.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2108.0, "completions/max_terminated_length": 2108.0, "completions/mean_length": 1064.416748046875, "completions/mean_terminated_length": 1064.416748046875, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.7720488466757124, "frac_reward_zero_std": 0.5, "grad_norm": 0.530232310295105, "kl": 0.0, "learning_rate": 1.1611456176673568e-07, "loss": 0.0046, "num_tokens": 69208236.0, "reward": 1.1166667938232422, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2276 }, { "completion_length": 1072.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1823.0, "completions/max_terminated_length": 1823.0, "completions/mean_length": 1072.5833740234375, "completions/mean_terminated_length": 1072.5833740234375, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 0.7723880597014925, "frac_reward_zero_std": 0.5, "grad_norm": 0.05342491716146469, "kl": 0.0, "learning_rate": 1.1594202898550725e-07, "loss": 0.0004, "num_tokens": 69235387.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2277 }, { "completion_length": 1687.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3089.0, "completions/max_terminated_length": 3089.0, "completions/mean_length": 1687.5833740234375, "completions/mean_terminated_length": 1687.5833740234375, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.7727272727272727, "frac_reward_zero_std": 0.5, "grad_norm": 0.7110770344734192, "kl": 0.0, "learning_rate": 1.1576949620427881e-07, "loss": 0.0103, "num_tokens": 69267452.0, "reward": 1.0166666507720947, "reward_std": 0.24013887345790863, "rewards/correctness_reward_func/mean": 0.7166666984558105, "rewards/correctness_reward_func/std": 0.4386618733406067, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2278 }, { "completion_length": 760.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1255.0, "completions/max_terminated_length": 1255.0, "completions/mean_length": 760.5833740234375, "completions/mean_terminated_length": 760.5833740234375, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 0.7730664857530529, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.1559696342305038e-07, "loss": 0.0, "num_tokens": 69290571.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2279 }, { "completion_length": 852.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3208.0, "completions/max_terminated_length": 3208.0, "completions/mean_length": 852.5, "completions/mean_terminated_length": 852.5, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 0.7734056987788331, "frac_reward_zero_std": 0.5, "grad_norm": 0.07909195870161057, "kl": 0.0, "learning_rate": 1.1542443064182194e-07, "loss": 0.0001, "num_tokens": 69312711.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2280 }, { "completion_length": 1729.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2583.0, "completions/max_terminated_length": 2583.0, "completions/mean_length": 1729.3333740234375, "completions/mean_terminated_length": 1729.3333740234375, "completions/min_length": 953.0, "completions/min_terminated_length": 953.0, "epoch": 0.7737449118046132, "frac_reward_zero_std": 0.5, "grad_norm": 0.10752316564321518, "kl": 0.0, "learning_rate": 1.1525189786059351e-07, "loss": -0.0009, "num_tokens": 69346063.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.45000001788139343, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2281 }, { "completion_length": 433.1666717529297, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 433.16668701171875, "completions/mean_terminated_length": 433.16668701171875, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.7740841248303935, "frac_reward_zero_std": 0.0, "grad_norm": 0.5329289436340332, "kl": 0.0, "learning_rate": 1.1507936507936506e-07, "loss": -0.0023, "num_tokens": 69362055.0, "reward": 0.949999988079071, "reward_std": 0.29902368783950806, "rewards/correctness_reward_func/mean": 0.6500000357627869, "rewards/correctness_reward_func/std": 0.4833594858646393, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2282 }, { "completion_length": 1366.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2393.0, "completions/max_terminated_length": 2393.0, "completions/mean_length": 1366.75, "completions/mean_terminated_length": 1366.75, "completions/min_length": 769.0, "completions/min_terminated_length": 769.0, "epoch": 0.7744233378561737, "frac_reward_zero_std": 0.0, "grad_norm": 0.3162420392036438, "kl": 0.0, "learning_rate": 1.1490683229813663e-07, "loss": 0.0047, "num_tokens": 69391980.0, "reward": 1.066666841506958, "reward_std": 0.21493908762931824, "rewards/correctness_reward_func/mean": 0.7666667103767395, "rewards/correctness_reward_func/std": 0.25346091389656067, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2283 }, { "completion_length": 1431.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3796.0, "completions/max_terminated_length": 3796.0, "completions/mean_length": 1431.5833740234375, "completions/mean_terminated_length": 1431.5833740234375, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.7747625508819539, "frac_reward_zero_std": 0.5, "grad_norm": 0.43762558698654175, "kl": 0.0, "learning_rate": 1.1473429951690821e-07, "loss": -0.0153, "num_tokens": 69419143.0, "reward": 0.7666667699813843, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.46666669845581055, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2284 }, { "completion_length": 1192.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3115.0, "completions/max_terminated_length": 3115.0, "completions/mean_length": 1192.666748046875, "completions/mean_terminated_length": 1192.666748046875, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "epoch": 0.7751017639077341, "frac_reward_zero_std": 0.5, "grad_norm": 0.687868058681488, "kl": 0.0, "learning_rate": 1.1456176673567977e-07, "loss": 0.0223, "num_tokens": 69443745.0, "reward": 0.6333333253860474, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2285 }, { "completion_length": 1606.9167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 4151.0, "completions/max_terminated_length": 4151.0, "completions/mean_length": 1606.916748046875, "completions/mean_terminated_length": 1606.916748046875, "completions/min_length": 593.0, "completions/min_terminated_length": 593.0, "epoch": 0.7754409769335142, "frac_reward_zero_std": 0.5, "grad_norm": 0.6134998798370361, "kl": 0.0, "learning_rate": 1.1438923395445134e-07, "loss": 0.0173, "num_tokens": 69474146.0, "reward": 1.149999976158142, "reward_std": 0.17606817185878754, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444525599479675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2286 }, { "completion_length": 1423.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3175.0, "completions/max_terminated_length": 3175.0, "completions/mean_length": 1423.416748046875, "completions/mean_terminated_length": 1423.416748046875, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.7757801899592944, "frac_reward_zero_std": 0.5, "grad_norm": 0.4118131697177887, "kl": 0.0, "learning_rate": 1.142167011732229e-07, "loss": -0.0012, "num_tokens": 69502507.0, "reward": 0.36666667461395264, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.06666667014360428, "rewards/correctness_reward_func/std": 0.2309401035308838, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2287 }, { "completion_length": 1093.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2504.0, "completions/max_terminated_length": 2504.0, "completions/mean_length": 1093.0833740234375, "completions/mean_terminated_length": 1093.0833740234375, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.7761194029850746, "frac_reward_zero_std": 0.5, "grad_norm": 0.08393443375825882, "kl": 0.0, "learning_rate": 1.1404416839199447e-07, "loss": -0.0005, "num_tokens": 69530234.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2288 }, { "completion_length": 1879.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4679.0, "completions/max_terminated_length": 4679.0, "completions/mean_length": 1879.5833740234375, "completions/mean_terminated_length": 1879.5833740234375, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.7764586160108549, "frac_reward_zero_std": 0.5, "grad_norm": 0.41416317224502563, "kl": 0.0, "learning_rate": 1.1387163561076604e-07, "loss": -0.0022, "num_tokens": 69564459.0, "reward": 0.7041666507720947, "reward_std": 0.23474276065826416, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 2289 }, { "completion_length": 846.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 846.25, "completions/mean_terminated_length": 846.25, "completions/min_length": 663.0, "completions/min_terminated_length": 663.0, "epoch": 0.776797829036635, "frac_reward_zero_std": 0.5, "grad_norm": 0.10412339866161346, "kl": 0.0, "learning_rate": 1.136991028295376e-07, "loss": -0.0004, "num_tokens": 69584628.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2290 }, { "completion_length": 947.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1583.0, "completions/max_terminated_length": 1583.0, "completions/mean_length": 947.75, "completions/mean_terminated_length": 947.75, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 0.7771370420624152, "frac_reward_zero_std": 0.0, "grad_norm": 0.12465295195579529, "kl": 0.0, "learning_rate": 1.1352657004830918e-07, "loss": -0.0021, "num_tokens": 69605067.0, "reward": 1.2333333492279053, "reward_std": 0.10327950119972229, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2291 }, { "completion_length": 1234.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1705.0, "completions/max_terminated_length": 1705.0, "completions/mean_length": 1234.25, "completions/mean_terminated_length": 1234.25, "completions/min_length": 698.0, "completions/min_terminated_length": 698.0, "epoch": 0.7774762550881954, "frac_reward_zero_std": 0.5, "grad_norm": 0.3546290397644043, "kl": 0.0, "learning_rate": 1.1335403726708075e-07, "loss": -0.0037, "num_tokens": 69633534.0, "reward": 0.4333333671092987, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.13333334028720856, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2292 }, { "completion_length": 885.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1735.0, "completions/max_terminated_length": 1735.0, "completions/mean_length": 885.0833740234375, "completions/mean_terminated_length": 885.0833740234375, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "epoch": 0.7778154681139756, "frac_reward_zero_std": 0.5, "grad_norm": 0.0517500638961792, "kl": 0.0, "learning_rate": 1.131815044858523e-07, "loss": 0.0003, "num_tokens": 69659035.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2293 }, { "completion_length": 1338.4167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 3024.0, "completions/max_terminated_length": 3024.0, "completions/mean_length": 1338.416748046875, "completions/mean_terminated_length": 1338.416748046875, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.7781546811397557, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.1300897170462387e-07, "loss": 0.0, "num_tokens": 69686322.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2294 }, { "completion_length": 2126.0834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 4981.0, "completions/max_terminated_length": 4981.0, "completions/mean_length": 2126.08349609375, "completions/mean_terminated_length": 2126.08349609375, "completions/min_length": 676.0, "completions/min_terminated_length": 676.0, "epoch": 0.7784938941655359, "frac_reward_zero_std": 1.0, "grad_norm": 1.7061682910934906e-07, "kl": 0.0, "learning_rate": 1.1283643892339544e-07, "loss": 0.0, "num_tokens": 69725305.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2295 }, { "completion_length": 2184.0000610351562, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5652.0, "completions/mean_length": 2733.08349609375, "completions/mean_terminated_length": 2382.54541015625, "completions/min_length": 828.0, "completions/min_terminated_length": 828.0, "epoch": 0.7788331071913162, "frac_reward_zero_std": 0.5, "grad_norm": 0.7152767777442932, "kl": NaN, "learning_rate": 1.12663906142167e-07, "loss": -0.0415, "num_tokens": 69764491.0, "reward": 0.7583334445953369, "reward_std": 0.22453653812408447, "rewards/correctness_reward_func/mean": 0.4833333194255829, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 2296 }, { "completion_length": 780.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1523.0, "completions/max_terminated_length": 1523.0, "completions/mean_length": 780.5833740234375, "completions/mean_terminated_length": 780.5833740234375, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 0.7791723202170964, "frac_reward_zero_std": 0.5, "grad_norm": 0.06526831537485123, "kl": 0.0, "learning_rate": 1.1249137336093857e-07, "loss": -0.0, "num_tokens": 69785192.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2297 }, { "completion_length": 838.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1652.0, "completions/max_terminated_length": 1652.0, "completions/mean_length": 838.75, "completions/mean_terminated_length": 838.75, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.7795115332428765, "frac_reward_zero_std": 0.0, "grad_norm": 0.1147807165980339, "kl": 0.0, "learning_rate": 1.1231884057971013e-07, "loss": 0.0014, "num_tokens": 69808271.0, "reward": 1.2666666507720947, "reward_std": 0.08164961636066437, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2298 }, { "completion_length": 992.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2892.0, "completions/max_terminated_length": 2892.0, "completions/mean_length": 992.3333740234375, "completions/mean_terminated_length": 992.3333740234375, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 0.7798507462686567, "frac_reward_zero_std": 0.0, "grad_norm": 0.3606627583503723, "kl": 0.0, "learning_rate": 1.1214630779848171e-07, "loss": -0.0105, "num_tokens": 69832989.0, "reward": 1.1875, "reward_std": 0.27556759119033813, "rewards/correctness_reward_func/mean": 0.9000000357627869, "rewards/correctness_reward_func/std": 0.28919950127601624, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 2299 }, { "completion_length": 2775.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 6420.0, "completions/max_terminated_length": 6420.0, "completions/mean_length": 2775.08349609375, "completions/mean_terminated_length": 2775.08349609375, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.7801899592944369, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.1197377501725328e-07, "loss": 0.0, "num_tokens": 69878608.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2300 }, { "completion_length": 706.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1261.0, "completions/max_terminated_length": 1261.0, "completions/mean_length": 706.1666870117188, "completions/mean_terminated_length": 706.1666870117188, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.7805291723202171, "frac_reward_zero_std": 1.0, "grad_norm": 1.0404102823713401e-07, "kl": 0.0, "learning_rate": 1.1180124223602484e-07, "loss": 0.0, "num_tokens": 69895848.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2301 }, { "completion_length": 759.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1619.0, "completions/max_terminated_length": 1619.0, "completions/mean_length": 759.9166870117188, "completions/mean_terminated_length": 759.9166870117188, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 0.7808683853459973, "frac_reward_zero_std": 0.5, "grad_norm": 0.08273164927959442, "kl": 0.0, "learning_rate": 1.1162870945479641e-07, "loss": -0.0003, "num_tokens": 69920681.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2302 }, { "completion_length": 1606.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3402.0, "completions/max_terminated_length": 3402.0, "completions/mean_length": 1606.75, "completions/mean_terminated_length": 1606.75, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 0.7812075983717774, "frac_reward_zero_std": 1.0, "grad_norm": 1.1805090593952627e-07, "kl": 0.0, "learning_rate": 1.1145617667356798e-07, "loss": 0.0, "num_tokens": 69953276.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2303 }, { "completion_length": 1484.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3372.0, "completions/max_terminated_length": 3372.0, "completions/mean_length": 1484.0833740234375, "completions/mean_terminated_length": 1484.0833740234375, "completions/min_length": 611.0, "completions/min_terminated_length": 611.0, "epoch": 0.7815468113975577, "frac_reward_zero_std": 0.0, "grad_norm": 0.6254962682723999, "kl": 0.0, "learning_rate": 1.1128364389233954e-07, "loss": 0.0117, "num_tokens": 69981513.0, "reward": 1.0333335399627686, "reward_std": 0.28828296065330505, "rewards/correctness_reward_func/mean": 0.7333332896232605, "rewards/correctness_reward_func/std": 0.3550501763820648, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2304 }, { "completion_length": 1113.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1595.0, "completions/max_terminated_length": 1595.0, "completions/mean_length": 1113.75, "completions/mean_terminated_length": 1113.75, "completions/min_length": 814.0, "completions/min_terminated_length": 814.0, "epoch": 0.7818860244233379, "frac_reward_zero_std": 1.0, "grad_norm": 1.3184315150738257e-07, "kl": 0.0, "learning_rate": 1.111111111111111e-07, "loss": 0.0, "num_tokens": 70008000.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2305 }, { "completion_length": 1300.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2420.0, "completions/max_terminated_length": 2420.0, "completions/mean_length": 1300.5, "completions/mean_terminated_length": 1300.5, "completions/min_length": 866.0, "completions/min_terminated_length": 866.0, "epoch": 0.7822252374491181, "frac_reward_zero_std": 0.5, "grad_norm": 0.07448822259902954, "kl": 0.0, "learning_rate": 1.1093857832988266e-07, "loss": 0.0005, "num_tokens": 70033728.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2306 }, { "completion_length": 697.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1504.0, "completions/max_terminated_length": 1504.0, "completions/mean_length": 697.1666870117188, "completions/mean_terminated_length": 697.1666870117188, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.7825644504748982, "frac_reward_zero_std": 0.5, "grad_norm": 0.03813989460468292, "kl": 0.0, "learning_rate": 1.1076604554865424e-07, "loss": -0.0002, "num_tokens": 70052432.0, "reward": 0.6875001788139343, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 2307 }, { "completion_length": 459.4166717529297, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 459.41668701171875, "completions/mean_terminated_length": 459.41668701171875, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.7829036635006784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.1059351276742581e-07, "loss": 0.0, "num_tokens": 70070203.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2308 }, { "completion_length": 1141.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3210.0, "completions/max_terminated_length": 3210.0, "completions/mean_length": 1141.75, "completions/mean_terminated_length": 1141.75, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 0.7832428765264586, "frac_reward_zero_std": 0.0, "grad_norm": 0.9833850264549255, "kl": 0.0, "learning_rate": 1.1042097998619737e-07, "loss": -0.0081, "num_tokens": 70096810.0, "reward": 0.7833334803581238, "reward_std": 0.38262733817100525, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2309 }, { "completion_length": 1042.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2285.0, "completions/max_terminated_length": 2285.0, "completions/mean_length": 1042.416748046875, "completions/mean_terminated_length": 1042.416748046875, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.7835820895522388, "frac_reward_zero_std": 0.5, "grad_norm": 0.4546540081501007, "kl": 0.0, "learning_rate": 1.1024844720496894e-07, "loss": -0.0043, "num_tokens": 70120047.0, "reward": 1.0, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.43064433336257935, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2310 }, { "completion_length": 1521.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2345.0, "completions/max_terminated_length": 2345.0, "completions/mean_length": 1521.5, "completions/mean_terminated_length": 1521.5, "completions/min_length": 907.0, "completions/min_terminated_length": 907.0, "epoch": 0.783921302578019, "frac_reward_zero_std": 0.5, "grad_norm": 0.08277913182973862, "kl": 0.0, "learning_rate": 1.100759144237405e-07, "loss": -0.0005, "num_tokens": 70144143.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2311 }, { "completion_length": 2593.416748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6421.0, "completions/mean_length": 3142.5, "completions/mean_terminated_length": 2829.181884765625, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "epoch": 0.7842605156037992, "frac_reward_zero_std": 0.0, "grad_norm": 0.10697486251592636, "kl": NaN, "learning_rate": 1.0990338164251207e-07, "loss": -0.0168, "num_tokens": 70187102.0, "reward": 0.7583333849906921, "reward_std": 0.10206204652786255, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 2312 }, { "completion_length": 1140.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1550.0, "completions/max_terminated_length": 1550.0, "completions/mean_length": 1140.0833740234375, "completions/mean_terminated_length": 1140.0833740234375, "completions/min_length": 668.0, "completions/min_terminated_length": 668.0, "epoch": 0.7845997286295794, "frac_reward_zero_std": 1.0, "grad_norm": 4.0352026076106995e-07, "kl": 0.0, "learning_rate": 1.0973084886128364e-07, "loss": 0.0, "num_tokens": 70214985.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2313 }, { "completion_length": 542.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 542.1666870117188, "completions/mean_terminated_length": 542.1666870117188, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.7849389416553596, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.0955831608005522e-07, "loss": 0.0, "num_tokens": 70233209.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2314 }, { "completion_length": 640.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 640.4166870117188, "completions/mean_terminated_length": 640.4166870117188, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.7852781546811397, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.0938578329882678e-07, "loss": 0.0, "num_tokens": 70254160.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2315 }, { "completion_length": 1004.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1545.0, "completions/max_terminated_length": 1545.0, "completions/mean_length": 1004.4166870117188, "completions/mean_terminated_length": 1004.4166870117188, "completions/min_length": 517.0, "completions/min_terminated_length": 517.0, "epoch": 0.7856173677069199, "frac_reward_zero_std": 0.5, "grad_norm": 1.077035903930664, "kl": 0.0, "learning_rate": 1.0921325051759834e-07, "loss": 0.0004, "num_tokens": 70273737.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2316 }, { "completion_length": 587.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 587.3333740234375, "completions/mean_terminated_length": 587.3333740234375, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 0.7859565807327001, "frac_reward_zero_std": 0.5, "grad_norm": 0.05898209661245346, "kl": 0.0, "learning_rate": 1.090407177363699e-07, "loss": -0.0001, "num_tokens": 70293625.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2317 }, { "completion_length": 2600.2501220703125, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4614.0, "completions/mean_length": 3149.33349609375, "completions/mean_terminated_length": 2836.636474609375, "completions/min_length": 1301.0, "completions/min_terminated_length": 1301.0, "epoch": 0.7862957937584804, "frac_reward_zero_std": 0.0, "grad_norm": 0.8913753628730774, "kl": NaN, "learning_rate": 1.0886818495514147e-07, "loss": -0.0214, "num_tokens": 70335832.0, "reward": 0.9250000715255737, "reward_std": 0.4538447856903076, "rewards/correctness_reward_func/mean": 0.6499999761581421, "rewards/correctness_reward_func/std": 0.40113475918769836, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 2318 }, { "completion_length": 985.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2104.0, "completions/max_terminated_length": 2104.0, "completions/mean_length": 985.5, "completions/mean_terminated_length": 985.5, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 0.7866350067842606, "frac_reward_zero_std": 0.5, "grad_norm": 0.41223669052124023, "kl": 0.0, "learning_rate": 1.0869565217391303e-07, "loss": -0.0011, "num_tokens": 70364548.0, "reward": 1.2000000476837158, "reward_std": 0.19999998807907104, "rewards/correctness_reward_func/mean": 0.9000000357627869, "rewards/correctness_reward_func/std": 0.28919950127601624, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2319 }, { "completion_length": 2798.3333435058594, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6482.0, "completions/mean_length": 3347.416748046875, "completions/mean_terminated_length": 3052.727294921875, "completions/min_length": 645.0, "completions/min_terminated_length": 645.0, "epoch": 0.7869742198100407, "frac_reward_zero_std": 0.5, "grad_norm": 0.6806624531745911, "kl": NaN, "learning_rate": 1.085231193926846e-07, "loss": -0.0682, "num_tokens": 70408142.0, "reward": 1.125, "reward_std": 0.23611436784267426, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444522619247437, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 2320 }, { "completion_length": 584.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 584.0833740234375, "completions/mean_terminated_length": 584.0833740234375, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.7873134328358209, "frac_reward_zero_std": 0.0, "grad_norm": 0.09985418617725372, "kl": 0.0, "learning_rate": 1.0835058661145617e-07, "loss": -0.0013, "num_tokens": 70427259.0, "reward": 1.183333396911621, "reward_std": 0.09246458858251572, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2321 }, { "completion_length": 1185.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2298.0, "completions/max_terminated_length": 2298.0, "completions/mean_length": 1185.916748046875, "completions/mean_terminated_length": 1185.916748046875, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "epoch": 0.7876526458616011, "frac_reward_zero_std": 0.0, "grad_norm": 0.392078697681427, "kl": 0.0, "learning_rate": 1.0817805383022775e-07, "loss": -0.0056, "num_tokens": 70458494.0, "reward": 0.9166667461395264, "reward_std": 0.2599138617515564, "rewards/correctness_reward_func/mean": 0.6166666746139526, "rewards/correctness_reward_func/std": 0.37618499994277954, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2322 }, { "completion_length": 798.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1197.0, "completions/max_terminated_length": 1197.0, "completions/mean_length": 798.0833740234375, "completions/mean_terminated_length": 798.0833740234375, "completions/min_length": 690.0, "completions/min_terminated_length": 690.0, "epoch": 0.7879918588873813, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.0800552104899931e-07, "loss": 0.0, "num_tokens": 70482081.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2323 }, { "completion_length": 1226.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2549.0, "completions/max_terminated_length": 2549.0, "completions/mean_length": 1226.0833740234375, "completions/mean_terminated_length": 1226.0833740234375, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.7883310719131614, "frac_reward_zero_std": 0.5, "grad_norm": 0.3598487973213196, "kl": 0.0, "learning_rate": 1.0783298826777088e-07, "loss": -0.0012, "num_tokens": 70510810.0, "reward": 1.0333335399627686, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2324 }, { "completion_length": 1887.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4087.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 1887.0833740234375, "completions/mean_terminated_length": 1887.0833740234375, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.7886702849389416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.0766045548654244e-07, "loss": 0.0, "num_tokens": 70546571.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2325 }, { "completion_length": 968.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3029.0, "completions/max_terminated_length": 3029.0, "completions/mean_length": 968.25, "completions/mean_terminated_length": 968.25, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.7890094979647219, "frac_reward_zero_std": 1.0, "grad_norm": 3.070735772325861e-07, "kl": 0.0, "learning_rate": 1.0748792270531401e-07, "loss": 0.0, "num_tokens": 70572824.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2326 }, { "completion_length": 1769.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4618.0, "completions/max_terminated_length": 4618.0, "completions/mean_length": 1769.666748046875, "completions/mean_terminated_length": 1769.666748046875, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.7893487109905021, "frac_reward_zero_std": 0.0, "grad_norm": 0.698104202747345, "kl": 0.0, "learning_rate": 1.0731538992408558e-07, "loss": -0.0323, "num_tokens": 70609168.0, "reward": 1.0, "reward_std": 0.2581988573074341, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.3357488214969635, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2327 }, { "completion_length": 696.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 696.0833740234375, "completions/mean_terminated_length": 696.0833740234375, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 0.7896879240162822, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.0714285714285713e-07, "loss": 0.0, "num_tokens": 70631351.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2328 }, { "completion_length": 1050.1667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 1577.0, "completions/max_terminated_length": 1577.0, "completions/mean_length": 1050.166748046875, "completions/mean_terminated_length": 1050.166748046875, "completions/min_length": 569.0, "completions/min_terminated_length": 569.0, "epoch": 0.7900271370420624, "frac_reward_zero_std": 0.0, "grad_norm": 0.41347283124923706, "kl": 0.0, "learning_rate": 1.069703243616287e-07, "loss": -0.0027, "num_tokens": 70655833.0, "reward": 1.0166666507720947, "reward_std": 0.26133137941360474, "rewards/correctness_reward_func/mean": 0.7166666984558105, "rewards/correctness_reward_func/std": 0.3459725081920624, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2329 }, { "completion_length": 1246.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1788.0, "completions/max_terminated_length": 1788.0, "completions/mean_length": 1246.416748046875, "completions/mean_terminated_length": 1246.416748046875, "completions/min_length": 916.0, "completions/min_terminated_length": 916.0, "epoch": 0.7903663500678426, "frac_reward_zero_std": 1.0, "grad_norm": 2.5109423518188123e-07, "kl": 0.0, "learning_rate": 1.0679779158040027e-07, "loss": 0.0, "num_tokens": 70683552.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2330 }, { "completion_length": 662.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 662.75, "completions/mean_terminated_length": 662.75, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.7907055630936228, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.0662525879917184e-07, "loss": 0.0, "num_tokens": 70702821.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2331 }, { "completion_length": 589.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 769.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 589.5, "completions/mean_terminated_length": 589.5, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 0.7910447761194029, "frac_reward_zero_std": 1.0, "grad_norm": 9.992771055067351e-08, "kl": 0.0, "learning_rate": 1.064527260179434e-07, "loss": 0.0, "num_tokens": 70716771.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2332 }, { "completion_length": 1025.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2343.0, "completions/max_terminated_length": 2343.0, "completions/mean_length": 1025.25, "completions/mean_terminated_length": 1025.25, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.7913839891451832, "frac_reward_zero_std": 0.0, "grad_norm": 0.4719524383544922, "kl": 0.0, "learning_rate": 1.0628019323671497e-07, "loss": -0.0061, "num_tokens": 70736850.0, "reward": 1.0500000715255737, "reward_std": 0.29902371764183044, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.36306771636009216, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2333 }, { "completion_length": 1778.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6403.0, "completions/max_terminated_length": 6403.0, "completions/mean_length": 1778.666748046875, "completions/mean_terminated_length": 1778.666748046875, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.7917232021709634, "frac_reward_zero_std": 0.5, "grad_norm": 0.7560432553291321, "kl": 0.0, "learning_rate": 1.0610766045548654e-07, "loss": -0.0333, "num_tokens": 70772438.0, "reward": 0.8666666746139526, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2334 }, { "completion_length": 951.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1813.0, "completions/max_terminated_length": 1813.0, "completions/mean_length": 951.8333740234375, "completions/mean_terminated_length": 951.8333740234375, "completions/min_length": 533.0, "completions/min_terminated_length": 533.0, "epoch": 0.7920624151967436, "frac_reward_zero_std": 0.5, "grad_norm": 0.057795215398073196, "kl": 0.0, "learning_rate": 1.059351276742581e-07, "loss": 0.0001, "num_tokens": 70798068.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2335 }, { "completion_length": 800.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1138.0, "completions/max_terminated_length": 1138.0, "completions/mean_length": 800.1666870117188, "completions/mean_terminated_length": 800.1666870117188, "completions/min_length": 542.0, "completions/min_terminated_length": 542.0, "epoch": 0.7924016282225237, "frac_reward_zero_std": 1.0, "grad_norm": 8.407631213458444e-08, "kl": 0.0, "learning_rate": 1.0576259489302967e-07, "loss": 0.0, "num_tokens": 70818098.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2336 }, { "completion_length": 1026.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1363.0, "completions/max_terminated_length": 1363.0, "completions/mean_length": 1026.416748046875, "completions/mean_terminated_length": 1026.416748046875, "completions/min_length": 688.0, "completions/min_terminated_length": 688.0, "epoch": 0.7927408412483039, "frac_reward_zero_std": 1.0, "grad_norm": 1.6520888834747893e-07, "kl": 0.0, "learning_rate": 1.0559006211180124e-07, "loss": 0.0, "num_tokens": 70843885.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2337 }, { "completion_length": 608.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 769.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 608.25, "completions/mean_terminated_length": 608.25, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.7930800542740841, "frac_reward_zero_std": 0.5, "grad_norm": 0.08355899900197983, "kl": 0.0, "learning_rate": 1.0541752933057282e-07, "loss": -0.0002, "num_tokens": 70861678.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2338 }, { "completion_length": 1858.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 3187.0, "completions/max_terminated_length": 3187.0, "completions/mean_length": 1858.25, "completions/mean_terminated_length": 1858.25, "completions/min_length": 646.0, "completions/min_terminated_length": 646.0, "epoch": 0.7934192672998643, "frac_reward_zero_std": 0.0, "grad_norm": 0.7079671621322632, "kl": 0.0, "learning_rate": 1.0524499654934437e-07, "loss": -0.0082, "num_tokens": 70894717.0, "reward": 0.9000000357627869, "reward_std": 0.2892930209636688, "rewards/correctness_reward_func/mean": 0.5999999642372131, "rewards/correctness_reward_func/std": 0.45126086473464966, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2339 }, { "completion_length": 1134.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2451.0, "completions/max_terminated_length": 2451.0, "completions/mean_length": 1134.8333740234375, "completions/mean_terminated_length": 1134.8333740234375, "completions/min_length": 650.0, "completions/min_terminated_length": 650.0, "epoch": 0.7937584803256446, "frac_reward_zero_std": 1.0, "grad_norm": 9.32706072376277e-08, "kl": 0.0, "learning_rate": 1.0507246376811593e-07, "loss": 0.0, "num_tokens": 70918043.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2340 }, { "completion_length": 1489.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3732.0, "completions/max_terminated_length": 3732.0, "completions/mean_length": 1489.25, "completions/mean_terminated_length": 1489.25, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.7940976933514247, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.048999309868875e-07, "loss": 0.0, "num_tokens": 70950734.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2341 }, { "completion_length": 1910.75, "completions/clipped_ratio": 0.0, "completions/max_length": 6331.0, "completions/max_terminated_length": 6331.0, "completions/mean_length": 1910.75, "completions/mean_terminated_length": 1910.75, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 0.7944369063772049, "frac_reward_zero_std": 0.0, "grad_norm": 0.65699702501297, "kl": 0.0, "learning_rate": 1.0472739820565907e-07, "loss": -0.0177, "num_tokens": 70987109.0, "reward": 1.0333333015441895, "reward_std": 0.28828293085098267, "rewards/correctness_reward_func/mean": 0.7333332896232605, "rewards/correctness_reward_func/std": 0.3550501763820648, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2342 }, { "completion_length": 668.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1188.0, "completions/max_terminated_length": 1188.0, "completions/mean_length": 668.25, "completions/mean_terminated_length": 668.25, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.7947761194029851, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.0455486542443063e-07, "loss": 0.0, "num_tokens": 71008424.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2343 }, { "completion_length": 1315.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3017.0, "completions/max_terminated_length": 3017.0, "completions/mean_length": 1315.5, "completions/mean_terminated_length": 1315.5, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 0.7951153324287653, "frac_reward_zero_std": 0.5, "grad_norm": 0.10511545091867447, "kl": 0.0, "learning_rate": 1.043823326432022e-07, "loss": -0.0005, "num_tokens": 71033942.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2344 }, { "completion_length": 653.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1586.0, "completions/max_terminated_length": 1586.0, "completions/mean_length": 653.5833740234375, "completions/mean_terminated_length": 653.5833740234375, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.7954545454545454, "frac_reward_zero_std": 0.5, "grad_norm": 0.06526116281747818, "kl": 0.0, "learning_rate": 1.0420979986197376e-07, "loss": 0.0006, "num_tokens": 71056017.0, "reward": 0.7666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2345 }, { "completion_length": 1020.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2008.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 1020.25, "completions/mean_terminated_length": 1020.25, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.7957937584803256, "frac_reward_zero_std": 1.0, "grad_norm": 1.2400761306707864e-07, "kl": 0.0, "learning_rate": 1.0403726708074534e-07, "loss": 0.0, "num_tokens": 71083722.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2346 }, { "completion_length": 1557.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3142.0, "completions/max_terminated_length": 3142.0, "completions/mean_length": 1557.3333740234375, "completions/mean_terminated_length": 1557.3333740234375, "completions/min_length": 718.0, "completions/min_terminated_length": 718.0, "epoch": 0.7961329715061058, "frac_reward_zero_std": 0.5, "grad_norm": 0.5018320679664612, "kl": 0.0, "learning_rate": 1.0386473429951691e-07, "loss": 0.0106, "num_tokens": 71114464.0, "reward": 0.6333333253860474, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2347 }, { "completion_length": 2239.8334350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 3843.0, "completions/max_terminated_length": 3843.0, "completions/mean_length": 2239.83349609375, "completions/mean_terminated_length": 2239.83349609375, "completions/min_length": 1123.0, "completions/min_terminated_length": 1123.0, "epoch": 0.7964721845318861, "frac_reward_zero_std": 0.5, "grad_norm": 0.12048373371362686, "kl": 0.0, "learning_rate": 1.0369220151828848e-07, "loss": 0.0005, "num_tokens": 71150036.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2348 }, { "completion_length": 1038.9167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2052.0, "completions/max_terminated_length": 2052.0, "completions/mean_length": 1038.916748046875, "completions/mean_terminated_length": 1038.916748046875, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.7968113975576662, "frac_reward_zero_std": 0.0, "grad_norm": 0.10352325439453125, "kl": 0.0, "learning_rate": 1.0351966873706004e-07, "loss": 0.0005, "num_tokens": 71171815.0, "reward": 1.2000000476837158, "reward_std": 0.08164963126182556, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2349 }, { "completion_length": 815.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1226.0, "completions/max_terminated_length": 1226.0, "completions/mean_length": 815.6666870117188, "completions/mean_terminated_length": 815.6666870117188, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "epoch": 0.7971506105834464, "frac_reward_zero_std": 0.5, "grad_norm": 0.06449928134679794, "kl": 0.0, "learning_rate": 1.0334713595583161e-07, "loss": 0.0002, "num_tokens": 71193507.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2350 }, { "completion_length": 2024.8333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3402.0, "completions/mean_length": 2573.916748046875, "completions/mean_terminated_length": 2208.9091796875, "completions/min_length": 1193.0, "completions/min_terminated_length": 1193.0, "epoch": 0.7974898236092266, "frac_reward_zero_std": 0.0, "grad_norm": 0.25730282068252563, "kl": NaN, "learning_rate": 1.0317460317460316e-07, "loss": -0.018, "num_tokens": 71231953.0, "reward": 1.0875000953674316, "reward_std": 0.2675197720527649, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 2351 }, { "completion_length": 547.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 547.9166870117188, "completions/mean_terminated_length": 547.9166870117188, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 0.7978290366350068, "frac_reward_zero_std": 0.0, "grad_norm": 0.12381388247013092, "kl": 0.0, "learning_rate": 1.0300207039337473e-07, "loss": -0.0004, "num_tokens": 71247936.0, "reward": 1.2166666984558105, "reward_std": 0.09246456623077393, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2352 }, { "completion_length": 733.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1179.0, "completions/max_terminated_length": 1179.0, "completions/mean_length": 733.9166870117188, "completions/mean_terminated_length": 733.9166870117188, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.7981682496607869, "frac_reward_zero_std": 1.0, "grad_norm": 2.2148809364352928e-07, "kl": 0.0, "learning_rate": 1.028295376121463e-07, "loss": 0.0, "num_tokens": 71264873.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2353 }, { "completion_length": 1402.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2607.0, "completions/max_terminated_length": 2607.0, "completions/mean_length": 1402.5, "completions/mean_terminated_length": 1402.5, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.7985074626865671, "frac_reward_zero_std": 0.5, "grad_norm": 0.6508964896202087, "kl": 0.0, "learning_rate": 1.0265700483091787e-07, "loss": -0.0127, "num_tokens": 71292875.0, "reward": 0.9500000476837158, "reward_std": 0.27386125922203064, "rewards/correctness_reward_func/mean": 0.6499999761581421, "rewards/correctness_reward_func/std": 0.40113475918769836, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2354 }, { "completion_length": 574.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 574.5, "completions/mean_terminated_length": 574.5, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.7988466757123474, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.0248447204968944e-07, "loss": 0.0, "num_tokens": 71309765.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2355 }, { "completion_length": 442.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 442.8333435058594, "completions/mean_terminated_length": 442.8333435058594, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.7991858887381276, "frac_reward_zero_std": 0.5, "grad_norm": 0.07335955649614334, "kl": 0.0, "learning_rate": 1.02311939268461e-07, "loss": 0.0001, "num_tokens": 71328153.0, "reward": 1.2333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2356 }, { "completion_length": 616.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1062.0, "completions/max_terminated_length": 1062.0, "completions/mean_length": 616.3333740234375, "completions/mean_terminated_length": 616.3333740234375, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.7995251017639078, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.0213940648723257e-07, "loss": 0.0, "num_tokens": 71349205.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2357 }, { "completion_length": 716.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1388.0, "completions/max_terminated_length": 1388.0, "completions/mean_length": 716.8333740234375, "completions/mean_terminated_length": 716.8333740234375, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 0.7998643147896879, "frac_reward_zero_std": 0.5, "grad_norm": 0.09145110845565796, "kl": 0.0, "learning_rate": 1.0196687370600414e-07, "loss": 0.0009, "num_tokens": 71368655.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2358 }, { "completion_length": 710.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1154.0, "completions/max_terminated_length": 1154.0, "completions/mean_length": 710.4166870117188, "completions/mean_terminated_length": 710.4166870117188, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 0.8002035278154681, "frac_reward_zero_std": 0.5, "grad_norm": 0.06413047760725021, "kl": 0.0, "learning_rate": 1.017943409247757e-07, "loss": 0.0002, "num_tokens": 71390926.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2359 }, { "completion_length": 745.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2552.0, "completions/max_terminated_length": 2552.0, "completions/mean_length": 745.1666870117188, "completions/mean_terminated_length": 745.1666870117188, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.8005427408412483, "frac_reward_zero_std": 0.5, "grad_norm": 0.07556004822254181, "kl": 0.0, "learning_rate": 1.0162180814354727e-07, "loss": -0.0007, "num_tokens": 71409516.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2360 }, { "completion_length": 1647.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4227.0, "completions/max_terminated_length": 4227.0, "completions/mean_length": 1647.166748046875, "completions/mean_terminated_length": 1647.166748046875, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 0.8008819538670285, "frac_reward_zero_std": 1.0, "grad_norm": 2.5301665118604433e-07, "kl": 0.0, "learning_rate": 1.0144927536231885e-07, "loss": 0.0, "num_tokens": 71440742.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2361 }, { "completion_length": 748.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1026.0, "completions/max_terminated_length": 1026.0, "completions/mean_length": 748.5833740234375, "completions/mean_terminated_length": 748.5833740234375, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 0.8012211668928086, "frac_reward_zero_std": 0.5, "grad_norm": 0.3156207799911499, "kl": 0.0, "learning_rate": 1.012767425810904e-07, "loss": 0.002, "num_tokens": 71458203.0, "reward": 0.9333333373069763, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.6333333253860474, "rewards/correctness_reward_func/std": 0.4735424220561981, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2362 }, { "completion_length": 2468.08349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 4757.0, "completions/max_terminated_length": 4757.0, "completions/mean_length": 2468.08349609375, "completions/mean_terminated_length": 2468.08349609375, "completions/min_length": 567.0, "completions/min_terminated_length": 567.0, "epoch": 0.8015603799185889, "frac_reward_zero_std": 0.5, "grad_norm": 0.14372634887695312, "kl": 0.0, "learning_rate": 1.0110420979986197e-07, "loss": -0.0009, "num_tokens": 71495020.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2363 }, { "completion_length": 1348.25, "completions/clipped_ratio": 0.0, "completions/max_length": 4945.0, "completions/max_terminated_length": 4945.0, "completions/mean_length": 1348.25, "completions/mean_terminated_length": 1348.25, "completions/min_length": 589.0, "completions/min_terminated_length": 589.0, "epoch": 0.8018995929443691, "frac_reward_zero_std": 0.5, "grad_norm": 0.5952526926994324, "kl": 0.0, "learning_rate": 1.0093167701863353e-07, "loss": 0.0189, "num_tokens": 71521777.0, "reward": 1.0, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.43064433336257935, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2364 }, { "completion_length": 653.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1076.0, "completions/max_terminated_length": 1076.0, "completions/mean_length": 653.0833740234375, "completions/mean_terminated_length": 653.0833740234375, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 0.8022388059701493, "frac_reward_zero_std": 1.0, "grad_norm": 1.9057215183693188e-07, "kl": 0.0, "learning_rate": 1.007591442374051e-07, "loss": 0.0, "num_tokens": 71544092.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2365 }, { "completion_length": 1776.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 5345.0, "completions/max_terminated_length": 5345.0, "completions/mean_length": 1776.416748046875, "completions/mean_terminated_length": 1776.416748046875, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.8025780189959294, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.0058661145617667e-07, "loss": 0.0, "num_tokens": 71578123.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2366 }, { "completion_length": 913.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2429.0, "completions/max_terminated_length": 2429.0, "completions/mean_length": 913.8333740234375, "completions/mean_terminated_length": 913.8333740234375, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 0.8029172320217096, "frac_reward_zero_std": 1.0, "grad_norm": 1.5469399272660667e-07, "kl": 0.0, "learning_rate": 1.0041407867494823e-07, "loss": 0.0, "num_tokens": 71598767.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2367 }, { "completion_length": 1538.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 6143.0, "completions/max_terminated_length": 6143.0, "completions/mean_length": 1538.3333740234375, "completions/mean_terminated_length": 1538.3333740234375, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 0.8032564450474898, "frac_reward_zero_std": 0.5, "grad_norm": 1.2080028057098389, "kl": 0.0, "learning_rate": 1.002415458937198e-07, "loss": 0.0454, "num_tokens": 71628663.0, "reward": 1.1166667938232422, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2368 }, { "completion_length": 1659.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3345.0, "completions/max_terminated_length": 3345.0, "completions/mean_length": 1659.5833740234375, "completions/mean_terminated_length": 1659.5833740234375, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.80359565807327, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.0006901311249138e-07, "loss": 0.0, "num_tokens": 71659990.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2369 }, { "completion_length": 589.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 589.8333740234375, "completions/mean_terminated_length": 589.8333740234375, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.8039348710990502, "frac_reward_zero_std": 1.0, "grad_norm": 9.122353361590285e-08, "kl": 0.0, "learning_rate": 9.989648033126294e-08, "loss": 0.0, "num_tokens": 71681180.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2370 }, { "completion_length": 1469.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3013.0, "completions/max_terminated_length": 3013.0, "completions/mean_length": 1469.166748046875, "completions/mean_terminated_length": 1469.166748046875, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "epoch": 0.8042740841248304, "frac_reward_zero_std": 0.5, "grad_norm": 0.6247000694274902, "kl": 0.0, "learning_rate": 9.972394755003451e-08, "loss": 0.0195, "num_tokens": 71710570.0, "reward": 0.45000001788139343, "reward_std": 0.2345207929611206, "rewards/correctness_reward_func/mean": 0.14999999105930328, "rewards/correctness_reward_func/std": 0.35290998220443726, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2371 }, { "completion_length": 744.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1039.0, "completions/max_terminated_length": 1039.0, "completions/mean_length": 744.5833740234375, "completions/mean_terminated_length": 744.5833740234375, "completions/min_length": 548.0, "completions/min_terminated_length": 548.0, "epoch": 0.8046132971506106, "frac_reward_zero_std": 0.5, "grad_norm": 0.0662793293595314, "kl": 0.0, "learning_rate": 9.955141476880608e-08, "loss": 0.0001, "num_tokens": 71729321.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2372 }, { "completion_length": 729.2500305175781, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 1648.0, "completions/mean_length": 1278.3333740234375, "completions/mean_terminated_length": 795.5454711914062, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 0.8049525101763908, "frac_reward_zero_std": 0.0, "grad_norm": 0.19801044464111328, "kl": NaN, "learning_rate": 9.937888198757763e-08, "loss": -0.013, "num_tokens": 71751722.0, "reward": 1.058333396911621, "reward_std": 0.2877541482448578, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 2373 }, { "completion_length": 881.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1526.0, "completions/max_terminated_length": 1526.0, "completions/mean_length": 881.3333740234375, "completions/mean_terminated_length": 881.3333740234375, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.805291723202171, "frac_reward_zero_std": 0.5, "grad_norm": 0.07455834746360779, "kl": 0.0, "learning_rate": 9.92063492063492e-08, "loss": 0.0003, "num_tokens": 71771718.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2374 }, { "completion_length": 910.9167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 1661.0, "completions/max_terminated_length": 1661.0, "completions/mean_length": 910.9166870117188, "completions/mean_terminated_length": 910.9166870117188, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 0.8056309362279511, "frac_reward_zero_std": 0.0, "grad_norm": 0.3829873502254486, "kl": 0.0, "learning_rate": 9.903381642512076e-08, "loss": -0.004, "num_tokens": 71799881.0, "reward": 0.7041666507720947, "reward_std": 0.23474276065826416, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 2375 }, { "completion_length": 1119.9167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2726.0, "completions/max_terminated_length": 2726.0, "completions/mean_length": 1119.916748046875, "completions/mean_terminated_length": 1119.916748046875, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.8059701492537313, "frac_reward_zero_std": 0.5, "grad_norm": 0.5393698811531067, "kl": 0.0, "learning_rate": 9.886128364389233e-08, "loss": -0.0169, "num_tokens": 71825590.0, "reward": 0.5666667222976685, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.2666666805744171, "rewards/correctness_reward_func/std": 0.393892765045166, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2376 }, { "completion_length": 1411.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3479.0, "completions/max_terminated_length": 3479.0, "completions/mean_length": 1411.416748046875, "completions/mean_terminated_length": 1411.416748046875, "completions/min_length": 605.0, "completions/min_terminated_length": 605.0, "epoch": 0.8063093622795116, "frac_reward_zero_std": 0.5, "grad_norm": 0.4091227352619171, "kl": 0.0, "learning_rate": 9.86887508626639e-08, "loss": -0.01, "num_tokens": 71851839.0, "reward": 1.0333335399627686, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2377 }, { "completion_length": 1261.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4433.0, "completions/max_terminated_length": 4433.0, "completions/mean_length": 1261.416748046875, "completions/mean_terminated_length": 1261.416748046875, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.8066485753052918, "frac_reward_zero_std": 0.5, "grad_norm": 0.11241351068019867, "kl": 0.0, "learning_rate": 9.851621808143547e-08, "loss": 0.0027, "num_tokens": 71877434.0, "reward": 0.7333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2378 }, { "completion_length": 1897.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3504.0, "completions/max_terminated_length": 3504.0, "completions/mean_length": 1897.666748046875, "completions/mean_terminated_length": 1897.666748046875, "completions/min_length": 861.0, "completions/min_terminated_length": 861.0, "epoch": 0.8069877883310719, "frac_reward_zero_std": 0.5, "grad_norm": 0.49451154470443726, "kl": 0.0, "learning_rate": 9.834368530020704e-08, "loss": 0.0006, "num_tokens": 71912026.0, "reward": 0.6333333253860474, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2379 }, { "completion_length": 1360.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2448.0, "completions/max_terminated_length": 2448.0, "completions/mean_length": 1360.0, "completions/mean_terminated_length": 1360.0, "completions/min_length": 914.0, "completions/min_terminated_length": 914.0, "epoch": 0.8073270013568521, "frac_reward_zero_std": 1.0, "grad_norm": 1.696111269211542e-07, "kl": 0.0, "learning_rate": 9.81711525189786e-08, "loss": 0.0, "num_tokens": 71942848.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2380 }, { "completion_length": 791.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1379.0, "completions/max_terminated_length": 1379.0, "completions/mean_length": 791.3333740234375, "completions/mean_terminated_length": 791.3333740234375, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.8076662143826323, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 9.799861973775017e-08, "loss": 0.0, "num_tokens": 71967482.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2381 }, { "completion_length": 968.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2633.0, "completions/max_terminated_length": 2633.0, "completions/mean_length": 968.25, "completions/mean_terminated_length": 968.25, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 0.8080054274084125, "frac_reward_zero_std": 0.5, "grad_norm": 0.08027996867895126, "kl": 0.0, "learning_rate": 9.782608695652174e-08, "loss": 0.0, "num_tokens": 71987693.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2382 }, { "completion_length": 835.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1833.0, "completions/max_terminated_length": 1833.0, "completions/mean_length": 835.3333740234375, "completions/mean_terminated_length": 835.3333740234375, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.8083446404341926, "frac_reward_zero_std": 1.0, "grad_norm": 1.1954676892855787e-07, "kl": 0.0, "learning_rate": 9.76535541752933e-08, "loss": 0.0, "num_tokens": 72009465.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2383 }, { "completion_length": 1304.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2092.0, "completions/max_terminated_length": 2092.0, "completions/mean_length": 1304.916748046875, "completions/mean_terminated_length": 1304.916748046875, "completions/min_length": 670.0, "completions/min_terminated_length": 670.0, "epoch": 0.8086838534599728, "frac_reward_zero_std": 0.5, "grad_norm": 0.34954652190208435, "kl": 0.0, "learning_rate": 9.748102139406487e-08, "loss": -0.0049, "num_tokens": 72042182.0, "reward": 1.0333335399627686, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2384 }, { "completion_length": 887.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2821.0, "completions/max_terminated_length": 2821.0, "completions/mean_length": 887.1666870117188, "completions/mean_terminated_length": 887.1666870117188, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 0.8090230664857531, "frac_reward_zero_std": 0.5, "grad_norm": 0.5617310404777527, "kl": 0.0, "learning_rate": 9.730848861283643e-08, "loss": 0.0164, "num_tokens": 72060130.0, "reward": 0.9666666388511658, "reward_std": 0.2581988573074341, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.4923659861087799, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2385 }, { "completion_length": 693.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 693.1666870117188, "completions/mean_terminated_length": 693.1666870117188, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 0.8093622795115333, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 9.7135955831608e-08, "loss": 0.0, "num_tokens": 72077910.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2386 }, { "completion_length": 865.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1911.0, "completions/max_terminated_length": 1911.0, "completions/mean_length": 865.6666870117188, "completions/mean_terminated_length": 865.6666870117188, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 0.8097014925373134, "frac_reward_zero_std": 0.0, "grad_norm": 0.4872001111507416, "kl": 0.0, "learning_rate": 9.696342305037957e-08, "loss": -0.0099, "num_tokens": 72101750.0, "reward": 0.949999988079071, "reward_std": 0.29902368783950806, "rewards/correctness_reward_func/mean": 0.6500000357627869, "rewards/correctness_reward_func/std": 0.4833594858646393, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2387 }, { "completion_length": 707.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1240.0, "completions/max_terminated_length": 1240.0, "completions/mean_length": 707.6666870117188, "completions/mean_terminated_length": 707.6666870117188, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.8100407055630936, "frac_reward_zero_std": 1.0, "grad_norm": 1.8968903248151037e-07, "kl": 0.0, "learning_rate": 9.679089026915113e-08, "loss": 0.0, "num_tokens": 72124012.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2388 }, { "completion_length": 1500.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3153.0, "completions/max_terminated_length": 3153.0, "completions/mean_length": 1500.25, "completions/mean_terminated_length": 1500.25, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 0.8103799185888738, "frac_reward_zero_std": 1.0, "grad_norm": 9.733233952147202e-08, "kl": 0.0, "learning_rate": 9.66183574879227e-08, "loss": 0.0, "num_tokens": 72151873.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2389 }, { "completion_length": 1136.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 5212.0, "completions/max_terminated_length": 5212.0, "completions/mean_length": 1136.416748046875, "completions/mean_terminated_length": 1136.416748046875, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 0.810719131614654, "frac_reward_zero_std": 0.5, "grad_norm": 0.6588881015777588, "kl": 0.0, "learning_rate": 9.644582470669426e-08, "loss": 0.0184, "num_tokens": 72178260.0, "reward": 0.8666667342185974, "reward_std": 0.2581988573074341, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.42497774958610535, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2390 }, { "completion_length": 869.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1321.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 869.1666870117188, "completions/mean_terminated_length": 869.1666870117188, "completions/min_length": 573.0, "completions/min_terminated_length": 573.0, "epoch": 0.8110583446404342, "frac_reward_zero_std": 0.5, "grad_norm": 0.07449919730424881, "kl": 0.0, "learning_rate": 9.627329192546583e-08, "loss": -0.0012, "num_tokens": 72203090.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2391 }, { "completion_length": 1503.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2791.0, "completions/max_terminated_length": 2791.0, "completions/mean_length": 1503.916748046875, "completions/mean_terminated_length": 1503.916748046875, "completions/min_length": 604.0, "completions/min_terminated_length": 604.0, "epoch": 0.8113975576662144, "frac_reward_zero_std": 0.5, "grad_norm": 0.08549807220697403, "kl": 0.0, "learning_rate": 9.610075914423741e-08, "loss": -0.001, "num_tokens": 72231397.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2392 }, { "completion_length": 786.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1200.0, "completions/max_terminated_length": 1200.0, "completions/mean_length": 786.9166870117188, "completions/mean_terminated_length": 786.9166870117188, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.8117367706919946, "frac_reward_zero_std": 0.5, "grad_norm": 0.0834188237786293, "kl": 0.0, "learning_rate": 9.592822636300898e-08, "loss": -0.001, "num_tokens": 72255360.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2393 }, { "completion_length": 724.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1193.0, "completions/max_terminated_length": 1193.0, "completions/mean_length": 724.0, "completions/mean_terminated_length": 724.0, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.8120759837177748, "frac_reward_zero_std": 1.0, "grad_norm": 8.956681085692253e-08, "kl": 0.0, "learning_rate": 9.575569358178054e-08, "loss": 0.0, "num_tokens": 72276768.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2394 }, { "completion_length": 778.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2950.0, "completions/max_terminated_length": 2950.0, "completions/mean_length": 778.5833740234375, "completions/mean_terminated_length": 778.5833740234375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.812415196743555, "frac_reward_zero_std": 0.5, "grad_norm": 0.10332927852869034, "kl": 0.0, "learning_rate": 9.558316080055211e-08, "loss": 0.005, "num_tokens": 72299845.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2395 }, { "completion_length": 616.5, "completions/clipped_ratio": 0.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 616.5, "completions/mean_terminated_length": 616.5, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.8127544097693351, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 9.541062801932366e-08, "loss": 0.0, "num_tokens": 72320263.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2396 }, { "completion_length": 2205.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 6580.0, "completions/max_terminated_length": 6580.0, "completions/mean_length": 2205.58349609375, "completions/mean_terminated_length": 2205.58349609375, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.8130936227951153, "frac_reward_zero_std": 0.5, "grad_norm": 0.6527693867683411, "kl": 0.0, "learning_rate": 9.523809523809523e-08, "loss": -0.0123, "num_tokens": 72360650.0, "reward": 1.0, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.43064433336257935, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2397 }, { "completion_length": 1845.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2818.0, "completions/max_terminated_length": 2818.0, "completions/mean_length": 1845.666748046875, "completions/mean_terminated_length": 1845.666748046875, "completions/min_length": 1197.0, "completions/min_terminated_length": 1197.0, "epoch": 0.8134328358208955, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 9.506556245686679e-08, "loss": 0.0, "num_tokens": 72394186.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2398 }, { "completion_length": 771.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1246.0, "completions/max_terminated_length": 1246.0, "completions/mean_length": 771.6666870117188, "completions/mean_terminated_length": 771.6666870117188, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "epoch": 0.8137720488466758, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 9.489302967563836e-08, "loss": 0.0, "num_tokens": 72416328.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2399 }, { "completion_length": 1477.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2776.0, "completions/max_terminated_length": 2776.0, "completions/mean_length": 1477.8333740234375, "completions/mean_terminated_length": 1477.8333740234375, "completions/min_length": 614.0, "completions/min_terminated_length": 614.0, "epoch": 0.8141112618724559, "frac_reward_zero_std": 0.0, "grad_norm": 0.1832352876663208, "kl": 0.0, "learning_rate": 9.472049689440994e-08, "loss": 0.0005, "num_tokens": 72447142.0, "reward": 1.183333396911621, "reward_std": 0.10641201585531235, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2400 }, { "completion_length": 1102.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 3391.0, "completions/max_terminated_length": 3391.0, "completions/mean_length": 1102.3333740234375, "completions/mean_terminated_length": 1102.3333740234375, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 0.8144504748982361, "frac_reward_zero_std": 0.5, "grad_norm": 0.10044291615486145, "kl": 0.0, "learning_rate": 9.45479641131815e-08, "loss": -0.0047, "num_tokens": 72475532.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2401 }, { "completion_length": 591.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 591.8333740234375, "completions/mean_terminated_length": 591.8333740234375, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.8147896879240163, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 9.437543133195307e-08, "loss": 0.0, "num_tokens": 72494472.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2402 }, { "completion_length": 1758.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 3527.0, "completions/max_terminated_length": 3527.0, "completions/mean_length": 1758.8333740234375, "completions/mean_terminated_length": 1758.8333740234375, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.8151289009497965, "frac_reward_zero_std": 0.5, "grad_norm": 0.6734954714775085, "kl": 0.0, "learning_rate": 9.420289855072464e-08, "loss": 0.0178, "num_tokens": 72524722.0, "reward": 1.1166666746139526, "reward_std": 0.24832776188850403, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.3857303261756897, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2403 }, { "completion_length": 874.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2416.0, "completions/max_terminated_length": 2416.0, "completions/mean_length": 874.6666870117188, "completions/mean_terminated_length": 874.6666870117188, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.8154681139755766, "frac_reward_zero_std": 0.5, "grad_norm": 0.4046372175216675, "kl": 0.0, "learning_rate": 9.40303657694962e-08, "loss": 0.0063, "num_tokens": 72545034.0, "reward": 0.9166667461395264, "reward_std": 0.24013885855674744, "rewards/correctness_reward_func/mean": 0.6166666746139526, "rewards/correctness_reward_func/std": 0.37618499994277954, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2404 }, { "completion_length": 832.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1303.0, "completions/max_terminated_length": 1303.0, "completions/mean_length": 832.3333740234375, "completions/mean_terminated_length": 832.3333740234375, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 0.8158073270013568, "frac_reward_zero_std": 1.0, "grad_norm": 1.1565265367607935e-07, "kl": 0.0, "learning_rate": 9.385783298826777e-08, "loss": 0.0, "num_tokens": 72568618.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2405 }, { "completion_length": 1289.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2572.0, "completions/max_terminated_length": 2572.0, "completions/mean_length": 1289.0833740234375, "completions/mean_terminated_length": 1289.0833740234375, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "epoch": 0.816146540027137, "frac_reward_zero_std": 0.0, "grad_norm": 0.11786416918039322, "kl": 0.0, "learning_rate": 9.368530020703933e-08, "loss": -0.0014, "num_tokens": 72598937.0, "reward": 1.25, "reward_std": 0.09246455878019333, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2406 }, { "completion_length": 824.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2160.0, "completions/max_terminated_length": 2160.0, "completions/mean_length": 824.5, "completions/mean_terminated_length": 824.5, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 0.8164857530529173, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 9.35127674258109e-08, "loss": 0.0, "num_tokens": 72617867.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2407 }, { "completion_length": 908.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1200.0, "completions/max_terminated_length": 1200.0, "completions/mean_length": 908.4166870117188, "completions/mean_terminated_length": 908.4166870117188, "completions/min_length": 598.0, "completions/min_terminated_length": 598.0, "epoch": 0.8168249660786974, "frac_reward_zero_std": 1.0, "grad_norm": 1.200104549070602e-07, "kl": 0.0, "learning_rate": 9.334023464458247e-08, "loss": 0.0, "num_tokens": 72636772.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2408 }, { "completion_length": 1319.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2594.0, "completions/max_terminated_length": 2594.0, "completions/mean_length": 1319.416748046875, "completions/mean_terminated_length": 1319.416748046875, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.8171641791044776, "frac_reward_zero_std": 0.5, "grad_norm": 0.09363001585006714, "kl": 0.0, "learning_rate": 9.316770186335403e-08, "loss": -0.0007, "num_tokens": 72663513.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2409 }, { "completion_length": 762.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1145.0, "completions/max_terminated_length": 1145.0, "completions/mean_length": 762.9166870117188, "completions/mean_terminated_length": 762.9166870117188, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 0.8175033921302578, "frac_reward_zero_std": 1.0, "grad_norm": 2.24809468818421e-07, "kl": 0.0, "learning_rate": 9.29951690821256e-08, "loss": 0.0, "num_tokens": 72685580.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2410 }, { "completion_length": 2145.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4293.0, "completions/max_terminated_length": 4293.0, "completions/mean_length": 2145.166748046875, "completions/mean_terminated_length": 2145.166748046875, "completions/min_length": 1171.0, "completions/min_terminated_length": 1171.0, "epoch": 0.817842605156038, "frac_reward_zero_std": 1.0, "grad_norm": 1.3578014090853685e-07, "kl": 0.0, "learning_rate": 9.282263630089717e-08, "loss": 0.0, "num_tokens": 72723958.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2411 }, { "completion_length": 799.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1558.0, "completions/max_terminated_length": 1558.0, "completions/mean_length": 799.75, "completions/mean_terminated_length": 799.75, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.8181818181818182, "frac_reward_zero_std": 0.5, "grad_norm": 0.2214116007089615, "kl": 0.0, "learning_rate": 9.265010351966873e-08, "loss": -0.0027, "num_tokens": 72742555.0, "reward": 1.133333444595337, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2412 }, { "completion_length": 584.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 584.3333740234375, "completions/mean_terminated_length": 584.3333740234375, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.8185210312075983, "frac_reward_zero_std": 1.0, "grad_norm": 1.4558594330082997e-07, "kl": 0.0, "learning_rate": 9.24775707384403e-08, "loss": 0.0, "num_tokens": 72762491.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2413 }, { "completion_length": 698.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 698.4166870117188, "completions/mean_terminated_length": 698.4166870117188, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 0.8188602442333786, "frac_reward_zero_std": 0.0, "grad_norm": 0.09449037909507751, "kl": 0.0, "learning_rate": 9.230503795721186e-08, "loss": -0.0006, "num_tokens": 72785416.0, "reward": 1.2000000476837158, "reward_std": 0.08164963126182556, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2414 }, { "completion_length": 777.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2283.0, "completions/max_terminated_length": 2283.0, "completions/mean_length": 777.3333740234375, "completions/mean_terminated_length": 777.3333740234375, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.8191994572591588, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 9.213250517598343e-08, "loss": 0.0, "num_tokens": 72805244.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2415 }, { "completion_length": 1558.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5003.0, "completions/max_terminated_length": 5003.0, "completions/mean_length": 1558.166748046875, "completions/mean_terminated_length": 1558.166748046875, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 0.819538670284939, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 9.195997239475501e-08, "loss": 0.0, "num_tokens": 72835192.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2416 }, { "completion_length": 951.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2114.0, "completions/max_terminated_length": 2114.0, "completions/mean_length": 951.6666870117188, "completions/mean_terminated_length": 951.6666870117188, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.8198778833107191, "frac_reward_zero_std": 1.0, "grad_norm": 1.5703493261298718e-07, "kl": 0.0, "learning_rate": 9.178743961352657e-08, "loss": 0.0, "num_tokens": 72855660.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2417 }, { "completion_length": 572.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 572.25, "completions/mean_terminated_length": 572.25, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.8202170963364993, "frac_reward_zero_std": 0.5, "grad_norm": 0.26274237036705017, "kl": 0.0, "learning_rate": 9.161490683229814e-08, "loss": -0.0009, "num_tokens": 72876291.0, "reward": 0.6333333253860474, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2418 }, { "completion_length": 705.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1044.0, "completions/max_terminated_length": 1044.0, "completions/mean_length": 705.75, "completions/mean_terminated_length": 705.75, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.8205563093622795, "frac_reward_zero_std": 0.5, "grad_norm": 0.06341227144002914, "kl": 0.0, "learning_rate": 9.14423740510697e-08, "loss": 0.0011, "num_tokens": 72895920.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2419 }, { "completion_length": 739.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1158.0, "completions/max_terminated_length": 1158.0, "completions/mean_length": 739.6666870117188, "completions/mean_terminated_length": 739.6666870117188, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 0.8208955223880597, "frac_reward_zero_std": 1.0, "grad_norm": 2.1254825810501643e-07, "kl": 0.0, "learning_rate": 9.126984126984126e-08, "loss": 0.0, "num_tokens": 72915524.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2420 }, { "completion_length": 593.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 890.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 593.75, "completions/mean_terminated_length": 593.75, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 0.8212347354138398, "frac_reward_zero_std": 0.5, "grad_norm": 0.05123069882392883, "kl": 0.0, "learning_rate": 9.109730848861283e-08, "loss": -0.0004, "num_tokens": 72937919.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2421 }, { "completion_length": 717.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1107.0, "completions/max_terminated_length": 1107.0, "completions/mean_length": 717.0, "completions/mean_terminated_length": 717.0, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.8215739484396201, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 9.092477570738439e-08, "loss": 0.0, "num_tokens": 72959831.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2422 }, { "completion_length": 641.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 641.6666870117188, "completions/mean_terminated_length": 641.6666870117188, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.8219131614654003, "frac_reward_zero_std": 0.5, "grad_norm": 0.4038350284099579, "kl": 0.0, "learning_rate": 9.075224292615596e-08, "loss": 0.0008, "num_tokens": 72981565.0, "reward": 1.0833334922790527, "reward_std": 0.19407902657985687, "rewards/correctness_reward_func/mean": 0.7833333015441895, "rewards/correctness_reward_func/std": 0.26227444410324097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2423 }, { "completion_length": 851.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1529.0, "completions/max_terminated_length": 1529.0, "completions/mean_length": 851.6666870117188, "completions/mean_terminated_length": 851.6666870117188, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "epoch": 0.8222523744911805, "frac_reward_zero_std": 0.0, "grad_norm": 0.3316085636615753, "kl": 0.0, "learning_rate": 9.057971014492754e-08, "loss": 0.0007, "num_tokens": 72999627.0, "reward": 1.0333333015441895, "reward_std": 0.2581988573074341, "rewards/correctness_reward_func/mean": 0.7333332896232605, "rewards/correctness_reward_func/std": 0.3550501763820648, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2424 }, { "completion_length": 620.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1054.0, "completions/max_terminated_length": 1054.0, "completions/mean_length": 620.5833740234375, "completions/mean_terminated_length": 620.5833740234375, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.8225915875169606, "frac_reward_zero_std": 1.0, "grad_norm": 2.0034275394209544e-07, "kl": 0.0, "learning_rate": 9.04071773636991e-08, "loss": 0.0, "num_tokens": 73021318.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2425 }, { "completion_length": 764.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1308.0, "completions/max_terminated_length": 1308.0, "completions/mean_length": 764.8333740234375, "completions/mean_terminated_length": 764.8333740234375, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.8229308005427408, "frac_reward_zero_std": 1.0, "grad_norm": 2.1904392610849754e-07, "kl": 0.0, "learning_rate": 9.023464458247067e-08, "loss": 0.0, "num_tokens": 73039640.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2426 }, { "completion_length": 1704.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4286.0, "completions/max_terminated_length": 4286.0, "completions/mean_length": 1704.75, "completions/mean_terminated_length": 1704.75, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "epoch": 0.823270013568521, "frac_reward_zero_std": 0.5, "grad_norm": 0.5144331455230713, "kl": 0.0, "learning_rate": 9.006211180124224e-08, "loss": 0.0146, "num_tokens": 73074647.0, "reward": 1.0333335399627686, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2427 }, { "completion_length": 2194.3334350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 3089.0, "completions/max_terminated_length": 3089.0, "completions/mean_length": 2194.33349609375, "completions/mean_terminated_length": 2194.33349609375, "completions/min_length": 1146.0, "completions/min_terminated_length": 1146.0, "epoch": 0.8236092265943012, "frac_reward_zero_std": 0.0, "grad_norm": 0.667299211025238, "kl": 0.0, "learning_rate": 8.98895790200138e-08, "loss": -0.0076, "num_tokens": 73111491.0, "reward": 0.8833333849906921, "reward_std": 0.4342670440673828, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2428 }, { "completion_length": 1759.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3586.0, "completions/max_terminated_length": 3586.0, "completions/mean_length": 1759.0, "completions/mean_terminated_length": 1759.0, "completions/min_length": 634.0, "completions/min_terminated_length": 634.0, "epoch": 0.8239484396200815, "frac_reward_zero_std": 0.0, "grad_norm": 0.1726318895816803, "kl": 0.0, "learning_rate": 8.971704623878537e-08, "loss": -0.0019, "num_tokens": 73144587.0, "reward": 1.183333396911621, "reward_std": 0.10641198605298996, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2429 }, { "completion_length": 2710.166748046875, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6214.0, "completions/mean_length": 3259.25, "completions/mean_terminated_length": 2956.545654296875, "completions/min_length": 659.0, "completions/min_terminated_length": 659.0, "epoch": 0.8242876526458616, "frac_reward_zero_std": 0.0, "grad_norm": 0.6384265422821045, "kl": NaN, "learning_rate": 8.954451345755692e-08, "loss": -0.0051, "num_tokens": 73187843.0, "reward": 0.8333333730697632, "reward_std": 0.2588963508605957, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.47736650705337524, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2430 }, { "completion_length": 1214.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3062.0, "completions/max_terminated_length": 3062.0, "completions/mean_length": 1214.75, "completions/mean_terminated_length": 1214.75, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.8246268656716418, "frac_reward_zero_std": 1.0, "grad_norm": 3.626901445841213e-07, "kl": 0.0, "learning_rate": 8.937198067632849e-08, "loss": 0.0, "num_tokens": 73215374.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2431 }, { "completion_length": 2397.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5449.0, "completions/max_terminated_length": 5449.0, "completions/mean_length": 2397.58349609375, "completions/mean_terminated_length": 2397.58349609375, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.824966078697422, "frac_reward_zero_std": 1.0, "grad_norm": 4.143779506193823e-07, "kl": 0.0, "learning_rate": 8.919944789510007e-08, "loss": 0.0, "num_tokens": 73251273.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2432 }, { "completion_length": 1050.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1678.0, "completions/max_terminated_length": 1678.0, "completions/mean_length": 1050.25, "completions/mean_terminated_length": 1050.25, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "epoch": 0.8253052917232022, "frac_reward_zero_std": 0.5, "grad_norm": 0.3289167881011963, "kl": 0.0, "learning_rate": 8.902691511387163e-08, "loss": -0.0065, "num_tokens": 73276032.0, "reward": 0.9541667699813843, "reward_std": 0.22716552019119263, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 2433 }, { "completion_length": 2027.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4044.0, "completions/max_terminated_length": 4044.0, "completions/mean_length": 2027.916748046875, "completions/mean_terminated_length": 2027.916748046875, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 0.8256445047489823, "frac_reward_zero_std": 0.5, "grad_norm": 0.6313930153846741, "kl": 0.0, "learning_rate": 8.88543823326432e-08, "loss": 0.0125, "num_tokens": 73313291.0, "reward": 0.5666667222976685, "reward_std": 0.20655910670757294, "rewards/correctness_reward_func/mean": 0.2666666805744171, "rewards/correctness_reward_func/std": 0.393892765045166, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2434 }, { "completion_length": 878.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2273.0, "completions/max_terminated_length": 2273.0, "completions/mean_length": 878.75, "completions/mean_terminated_length": 878.75, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 0.8259837177747625, "frac_reward_zero_std": 1.0, "grad_norm": 1.5735450631382264e-07, "kl": 0.0, "learning_rate": 8.868184955141476e-08, "loss": 0.0, "num_tokens": 73334474.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2435 }, { "completion_length": 3119.33349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 5635.0, "completions/max_terminated_length": 5635.0, "completions/mean_length": 3119.33349609375, "completions/mean_terminated_length": 3119.33349609375, "completions/min_length": 1209.0, "completions/min_terminated_length": 1209.0, "epoch": 0.8263229308005428, "frac_reward_zero_std": 0.5, "grad_norm": 0.12345325201749802, "kl": 0.0, "learning_rate": 8.850931677018633e-08, "loss": -0.0002, "num_tokens": 73384326.0, "reward": 0.7333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2436 }, { "completion_length": 848.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2082.0, "completions/max_terminated_length": 2082.0, "completions/mean_length": 848.9166870117188, "completions/mean_terminated_length": 848.9166870117188, "completions/min_length": 527.0, "completions/min_terminated_length": 527.0, "epoch": 0.826662143826323, "frac_reward_zero_std": 0.5, "grad_norm": 0.05949406325817108, "kl": 0.0, "learning_rate": 8.83367839889579e-08, "loss": 0.0001, "num_tokens": 73404659.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2437 }, { "completion_length": 681.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1135.0, "completions/max_terminated_length": 1135.0, "completions/mean_length": 681.5833740234375, "completions/mean_terminated_length": 681.5833740234375, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.8270013568521031, "frac_reward_zero_std": 0.5, "grad_norm": 0.07842101156711578, "kl": 0.0, "learning_rate": 8.816425120772946e-08, "loss": -0.0001, "num_tokens": 73427184.0, "reward": 1.2333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2438 }, { "completion_length": 1209.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2684.0, "completions/max_terminated_length": 2684.0, "completions/mean_length": 1209.25, "completions/mean_terminated_length": 1209.25, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 0.8273405698778833, "frac_reward_zero_std": 0.5, "grad_norm": 0.7383807301521301, "kl": 0.0, "learning_rate": 8.799171842650104e-08, "loss": -0.0125, "num_tokens": 73451847.0, "reward": 0.38333332538604736, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.0833333358168602, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2439 }, { "completion_length": 2113.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6168.0, "completions/max_terminated_length": 6168.0, "completions/mean_length": 2113.416748046875, "completions/mean_terminated_length": 2113.416748046875, "completions/min_length": 712.0, "completions/min_terminated_length": 712.0, "epoch": 0.8276797829036635, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 8.781918564527261e-08, "loss": 0.0, "num_tokens": 73493870.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2440 }, { "completion_length": 2047.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6136.0, "completions/max_terminated_length": 6136.0, "completions/mean_length": 2047.0, "completions/mean_terminated_length": 2047.0, "completions/min_length": 667.0, "completions/min_terminated_length": 667.0, "epoch": 0.8280189959294437, "frac_reward_zero_std": 0.5, "grad_norm": 0.10474789887666702, "kl": 0.0, "learning_rate": 8.764665286404417e-08, "loss": -0.0027, "num_tokens": 73530332.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2441 }, { "completion_length": 1404.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 3805.0, "completions/max_terminated_length": 3805.0, "completions/mean_length": 1404.25, "completions/mean_terminated_length": 1404.25, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.8283582089552238, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 8.747412008281573e-08, "loss": 0.0, "num_tokens": 73560449.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2442 }, { "completion_length": 1886.8333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5931.0, "completions/mean_length": 2435.916748046875, "completions/mean_terminated_length": 2058.36376953125, "completions/min_length": 596.0, "completions/min_terminated_length": 596.0, "epoch": 0.828697421981004, "frac_reward_zero_std": 0.0, "grad_norm": 0.6622955799102783, "kl": NaN, "learning_rate": 8.730158730158729e-08, "loss": -0.026, "num_tokens": 73596303.0, "reward": 1.0333333015441895, "reward_std": 0.2581988573074341, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.35505014657974243, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2443 }, { "completion_length": 742.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1033.0, "completions/max_terminated_length": 1033.0, "completions/mean_length": 742.0, "completions/mean_terminated_length": 742.0, "completions/min_length": 538.0, "completions/min_terminated_length": 538.0, "epoch": 0.8290366350067843, "frac_reward_zero_std": 1.0, "grad_norm": 1.6147606629601796e-07, "kl": 0.0, "learning_rate": 8.712905452035886e-08, "loss": 0.0, "num_tokens": 73619391.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2444 }, { "completion_length": 1147.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2392.0, "completions/max_terminated_length": 2392.0, "completions/mean_length": 1147.8333740234375, "completions/mean_terminated_length": 1147.8333740234375, "completions/min_length": 627.0, "completions/min_terminated_length": 627.0, "epoch": 0.8293758480325645, "frac_reward_zero_std": 1.0, "grad_norm": 1.8881665653225355e-07, "kl": 0.0, "learning_rate": 8.695652173913042e-08, "loss": 0.0, "num_tokens": 73647475.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2445 }, { "completion_length": 1699.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3291.0, "completions/max_terminated_length": 3291.0, "completions/mean_length": 1699.5, "completions/mean_terminated_length": 1699.5, "completions/min_length": 603.0, "completions/min_terminated_length": 603.0, "epoch": 0.8297150610583447, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 8.678398895790199e-08, "loss": 0.0, "num_tokens": 73679875.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2446 }, { "completion_length": 1114.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2109.0, "completions/max_terminated_length": 2109.0, "completions/mean_length": 1114.3333740234375, "completions/mean_terminated_length": 1114.3333740234375, "completions/min_length": 672.0, "completions/min_terminated_length": 672.0, "epoch": 0.8300542740841248, "frac_reward_zero_std": 0.5, "grad_norm": 0.09982907027006149, "kl": 0.0, "learning_rate": 8.661145617667357e-08, "loss": -0.0016, "num_tokens": 73706183.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2447 }, { "completion_length": 1275.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2797.0, "completions/max_terminated_length": 2797.0, "completions/mean_length": 1275.3333740234375, "completions/mean_terminated_length": 1275.3333740234375, "completions/min_length": 586.0, "completions/min_terminated_length": 586.0, "epoch": 0.830393487109905, "frac_reward_zero_std": 0.5, "grad_norm": 0.6221510767936707, "kl": 0.0, "learning_rate": 8.643892339544514e-08, "loss": -0.0078, "num_tokens": 73738119.0, "reward": 1.133333444595337, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2448 }, { "completion_length": 952.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1528.0, "completions/max_terminated_length": 1528.0, "completions/mean_length": 952.9166870117188, "completions/mean_terminated_length": 952.9166870117188, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 0.8307327001356852, "frac_reward_zero_std": 0.5, "grad_norm": 0.49503713846206665, "kl": 0.0, "learning_rate": 8.62663906142167e-08, "loss": -0.003, "num_tokens": 73760324.0, "reward": 0.9333333373069763, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.6333333849906921, "rewards/correctness_reward_func/std": 0.4735424220561981, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2449 }, { "completion_length": 623.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1516.0, "completions/max_terminated_length": 1516.0, "completions/mean_length": 623.9166870117188, "completions/mean_terminated_length": 623.9166870117188, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 0.8310719131614654, "frac_reward_zero_std": 0.5, "grad_norm": 0.3094821870326996, "kl": 0.0, "learning_rate": 8.609385783298827e-08, "loss": -0.0015, "num_tokens": 73775383.0, "reward": 1.1041667461395264, "reward_std": 0.23474274575710297, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 2450 }, { "completion_length": 536.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 536.3333740234375, "completions/mean_terminated_length": 536.3333740234375, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.8314111261872456, "frac_reward_zero_std": 0.5, "grad_norm": 0.05547710880637169, "kl": 0.0, "learning_rate": 8.592132505175983e-08, "loss": 0.0004, "num_tokens": 73794923.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2451 }, { "completion_length": 1248.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2388.0, "completions/max_terminated_length": 2388.0, "completions/mean_length": 1248.25, "completions/mean_terminated_length": 1248.25, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 0.8317503392130258, "frac_reward_zero_std": 0.5, "grad_norm": 0.30352479219436646, "kl": 0.0, "learning_rate": 8.57487922705314e-08, "loss": -0.001, "num_tokens": 73823762.0, "reward": 0.9000000953674316, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.6000000238418579, "rewards/correctness_reward_func/std": 0.36181360483169556, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2452 }, { "completion_length": 762.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1295.0, "completions/max_terminated_length": 1295.0, "completions/mean_length": 762.25, "completions/mean_terminated_length": 762.25, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.832089552238806, "frac_reward_zero_std": 0.5, "grad_norm": 0.06194649264216423, "kl": 0.0, "learning_rate": 8.557625948930295e-08, "loss": -0.0002, "num_tokens": 73845527.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2453 }, { "completion_length": 1017.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2641.0, "completions/max_terminated_length": 2641.0, "completions/mean_length": 1017.0833740234375, "completions/mean_terminated_length": 1017.0833740234375, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 0.8324287652645862, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 8.540372670807452e-08, "loss": 0.0, "num_tokens": 73871646.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2454 }, { "completion_length": 960.1667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2795.0, "completions/max_terminated_length": 2795.0, "completions/mean_length": 960.1666870117188, "completions/mean_terminated_length": 960.1666870117188, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 0.8327679782903663, "frac_reward_zero_std": 0.5, "grad_norm": 0.5198352932929993, "kl": 0.0, "learning_rate": 8.52311939268461e-08, "loss": 0.008, "num_tokens": 73892864.0, "reward": 0.9833334684371948, "reward_std": 0.222860187292099, "rewards/correctness_reward_func/mean": 0.6833333969116211, "rewards/correctness_reward_func/std": 0.32427075505256653, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2455 }, { "completion_length": 1836.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3735.0, "completions/max_terminated_length": 3735.0, "completions/mean_length": 1836.0, "completions/mean_terminated_length": 1836.0, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "epoch": 0.8331071913161465, "frac_reward_zero_std": 0.5, "grad_norm": 0.26148954033851624, "kl": 0.0, "learning_rate": 8.505866114561766e-08, "loss": -0.0046, "num_tokens": 73927268.0, "reward": 0.6333333253860474, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2456 }, { "completion_length": 733.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1507.0, "completions/max_terminated_length": 1507.0, "completions/mean_length": 733.5, "completions/mean_terminated_length": 733.5, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.8334464043419267, "frac_reward_zero_std": 0.5, "grad_norm": 0.07438584417104721, "kl": 0.0, "learning_rate": 8.488612836438923e-08, "loss": 0.0003, "num_tokens": 73948862.0, "reward": 0.7333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2457 }, { "completion_length": 1497.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2503.0, "completions/max_terminated_length": 2503.0, "completions/mean_length": 1497.5, "completions/mean_terminated_length": 1497.5, "completions/min_length": 928.0, "completions/min_terminated_length": 928.0, "epoch": 0.833785617367707, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 8.47135955831608e-08, "loss": 0.0, "num_tokens": 73980770.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2458 }, { "completion_length": 623.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1281.0, "completions/max_terminated_length": 1281.0, "completions/mean_length": 623.1666870117188, "completions/mean_terminated_length": 623.1666870117188, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.8341248303934871, "frac_reward_zero_std": 0.5, "grad_norm": 0.12110241502523422, "kl": 0.0, "learning_rate": 8.454106280193236e-08, "loss": 0.0009, "num_tokens": 73999450.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2459 }, { "completion_length": 1268.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3504.0, "completions/max_terminated_length": 3504.0, "completions/mean_length": 1268.416748046875, "completions/mean_terminated_length": 1268.416748046875, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.8344640434192673, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 8.436853002070393e-08, "loss": 0.0, "num_tokens": 74025315.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2460 }, { "completion_length": 1610.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3500.0, "completions/max_terminated_length": 3500.0, "completions/mean_length": 1610.916748046875, "completions/mean_terminated_length": 1610.916748046875, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.8348032564450475, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 8.41959972394755e-08, "loss": 0.0, "num_tokens": 74059766.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2461 }, { "completion_length": 1382.8333740234375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4060.0, "completions/mean_length": 2481.0, "completions/mean_terminated_length": 1659.4000244140625, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 0.8351424694708277, "frac_reward_zero_std": 0.0, "grad_norm": 0.13119420409202576, "kl": NaN, "learning_rate": 8.402346445824707e-08, "loss": -0.0153, "num_tokens": 74088996.0, "reward": 0.7333333492279053, "reward_std": 0.11828448623418808, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 2462 }, { "completion_length": 1573.4166870117188, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4166.0, "completions/mean_length": 2122.5, "completions/mean_terminated_length": 1716.45458984375, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 0.8354816824966079, "frac_reward_zero_std": 0.5, "grad_norm": 0.6057560443878174, "kl": NaN, "learning_rate": 8.385093167701864e-08, "loss": 0.0012, "num_tokens": 74119283.0, "reward": 1.008333444595337, "reward_std": 0.3006936311721802, "rewards/correctness_reward_func/mean": 0.7333332896232605, "rewards/correctness_reward_func/std": 0.35505014657974243, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 2463 }, { "completion_length": 845.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1404.0, "completions/max_terminated_length": 1404.0, "completions/mean_length": 845.4166870117188, "completions/mean_terminated_length": 845.4166870117188, "completions/min_length": 525.0, "completions/min_terminated_length": 525.0, "epoch": 0.835820895522388, "frac_reward_zero_std": 1.0, "grad_norm": 1.8278333868693153e-07, "kl": 0.0, "learning_rate": 8.36783988957902e-08, "loss": 0.0, "num_tokens": 74144572.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2464 }, { "completion_length": 1578.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3968.0, "completions/max_terminated_length": 3968.0, "completions/mean_length": 1578.8333740234375, "completions/mean_terminated_length": 1578.8333740234375, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 0.8361601085481682, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 8.350586611456176e-08, "loss": 0.0, "num_tokens": 74171372.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2465 }, { "completion_length": 2119.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3760.0, "completions/max_terminated_length": 3760.0, "completions/mean_length": 2119.0, "completions/mean_terminated_length": 2119.0, "completions/min_length": 988.0, "completions/min_terminated_length": 988.0, "epoch": 0.8364993215739485, "frac_reward_zero_std": 0.5, "grad_norm": 0.3796434700489044, "kl": 0.0, "learning_rate": 8.333333333333333e-08, "loss": 0.0009, "num_tokens": 74209136.0, "reward": 0.8666666746139526, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2466 }, { "completion_length": 1142.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3404.0, "completions/max_terminated_length": 3404.0, "completions/mean_length": 1142.666748046875, "completions/mean_terminated_length": 1142.666748046875, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.8368385345997287, "frac_reward_zero_std": 0.0, "grad_norm": 0.16888898611068726, "kl": 0.0, "learning_rate": 8.316080055210489e-08, "loss": 0.0072, "num_tokens": 74238436.0, "reward": 1.2166666984558105, "reward_std": 0.10641198605298996, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2467 }, { "completion_length": 939.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1497.0, "completions/max_terminated_length": 1497.0, "completions/mean_length": 939.0, "completions/mean_terminated_length": 939.0, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "epoch": 0.8371777476255088, "frac_reward_zero_std": 1.0, "grad_norm": 1.2496100509906682e-07, "kl": 0.0, "learning_rate": 8.298826777087646e-08, "loss": 0.0, "num_tokens": 74261872.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2468 }, { "completion_length": 1220.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2718.0, "completions/max_terminated_length": 2718.0, "completions/mean_length": 1220.25, "completions/mean_terminated_length": 1220.25, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 0.837516960651289, "frac_reward_zero_std": 0.5, "grad_norm": 0.05765094980597496, "kl": 0.0, "learning_rate": 8.281573498964802e-08, "loss": -0.0003, "num_tokens": 74291575.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2469 }, { "completion_length": 1429.416748046875, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6488.0, "completions/mean_length": 4174.83349609375, "completions/mean_terminated_length": 2450.4287109375, "completions/min_length": 1060.0, "completions/min_terminated_length": 1060.0, "epoch": 0.8378561736770692, "frac_reward_zero_std": 0.5, "grad_norm": 0.12614482641220093, "kl": NaN, "learning_rate": 8.26432022084196e-08, "loss": -0.0103, "num_tokens": 74318562.0, "reward": 0.16250000894069672, "reward_std": 0.03061862289905548, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.16250000894069672, "rewards/format_reward_func/std": 0.14943073689937592, "step": 2470 }, { "completion_length": 1021.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2312.0, "completions/max_terminated_length": 2312.0, "completions/mean_length": 1021.0833740234375, "completions/mean_terminated_length": 1021.0833740234375, "completions/min_length": 688.0, "completions/min_terminated_length": 688.0, "epoch": 0.8381953867028494, "frac_reward_zero_std": 1.0, "grad_norm": 1.005433176715087e-07, "kl": 0.0, "learning_rate": 8.247066942719117e-08, "loss": 0.0, "num_tokens": 74338915.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2471 }, { "completion_length": 974.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1846.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 974.0833740234375, "completions/mean_terminated_length": 974.0833740234375, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 0.8385345997286295, "frac_reward_zero_std": 0.0, "grad_norm": 0.40504151582717896, "kl": 0.0, "learning_rate": 8.229813664596273e-08, "loss": 0.0009, "num_tokens": 74360228.0, "reward": 1.2000000476837158, "reward_std": 0.24494895339012146, "rewards/correctness_reward_func/mean": 0.9000000357627869, "rewards/correctness_reward_func/std": 0.28919950127601624, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2472 }, { "completion_length": 815.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1414.0, "completions/max_terminated_length": 1414.0, "completions/mean_length": 815.0833740234375, "completions/mean_terminated_length": 815.0833740234375, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 0.8388738127544098, "frac_reward_zero_std": 1.0, "grad_norm": 1.062460270873089e-07, "kl": 0.0, "learning_rate": 8.21256038647343e-08, "loss": 0.0, "num_tokens": 74382315.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2473 }, { "completion_length": 1318.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2841.0, "completions/max_terminated_length": 2841.0, "completions/mean_length": 1318.666748046875, "completions/mean_terminated_length": 1318.666748046875, "completions/min_length": 556.0, "completions/min_terminated_length": 556.0, "epoch": 0.83921302578019, "frac_reward_zero_std": 0.5, "grad_norm": 0.08853453397750854, "kl": 0.0, "learning_rate": 8.195307108350587e-08, "loss": -0.0013, "num_tokens": 74407547.0, "reward": 0.7333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2474 }, { "completion_length": 1065.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2731.0, "completions/max_terminated_length": 2731.0, "completions/mean_length": 1065.666748046875, "completions/mean_terminated_length": 1065.666748046875, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.8395522388059702, "frac_reward_zero_std": 1.0, "grad_norm": 1.6640076694329764e-07, "kl": 0.0, "learning_rate": 8.178053830227743e-08, "loss": 0.0, "num_tokens": 74433985.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2475 }, { "completion_length": 602.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 602.1666870117188, "completions/mean_terminated_length": 602.1666870117188, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "epoch": 0.8398914518317503, "frac_reward_zero_std": 1.0, "grad_norm": 1.3334225457128923e-07, "kl": 0.0, "learning_rate": 8.160800552104899e-08, "loss": 0.0, "num_tokens": 74454459.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2476 }, { "completion_length": 821.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1485.0, "completions/max_terminated_length": 1485.0, "completions/mean_length": 821.0, "completions/mean_terminated_length": 821.0, "completions/min_length": 561.0, "completions/min_terminated_length": 561.0, "epoch": 0.8402306648575305, "frac_reward_zero_std": 0.5, "grad_norm": 0.08351096510887146, "kl": 0.0, "learning_rate": 8.143547273982055e-08, "loss": -0.0, "num_tokens": 74476095.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2477 }, { "completion_length": 2182.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 4133.0, "completions/max_terminated_length": 4133.0, "completions/mean_length": 2182.0, "completions/mean_terminated_length": 2182.0, "completions/min_length": 1008.0, "completions/min_terminated_length": 1008.0, "epoch": 0.8405698778833107, "frac_reward_zero_std": 0.5, "grad_norm": 0.09407032281160355, "kl": 0.0, "learning_rate": 8.126293995859213e-08, "loss": -0.0004, "num_tokens": 74512233.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2478 }, { "completion_length": 1605.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4001.0, "completions/max_terminated_length": 4001.0, "completions/mean_length": 1605.5, "completions/mean_terminated_length": 1605.5, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.8409090909090909, "frac_reward_zero_std": 0.0, "grad_norm": 0.1582781970500946, "kl": 0.0, "learning_rate": 8.10904071773637e-08, "loss": 0.0039, "num_tokens": 74544591.0, "reward": 1.2333333492279053, "reward_std": 0.0955970510840416, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2479 }, { "completion_length": 733.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1362.0, "completions/max_terminated_length": 1362.0, "completions/mean_length": 733.0833740234375, "completions/mean_terminated_length": 733.0833740234375, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 0.841248303934871, "frac_reward_zero_std": 1.0, "grad_norm": 1.4110661084032472e-07, "kl": 0.0, "learning_rate": 8.091787439613526e-08, "loss": 0.0, "num_tokens": 74567080.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2480 }, { "completion_length": 2215.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4005.0, "completions/max_terminated_length": 4005.0, "completions/mean_length": 2215.33349609375, "completions/mean_terminated_length": 2215.33349609375, "completions/min_length": 1173.0, "completions/min_terminated_length": 1173.0, "epoch": 0.8415875169606513, "frac_reward_zero_std": 0.5, "grad_norm": 0.39150166511535645, "kl": 0.0, "learning_rate": 8.074534161490683e-08, "loss": 0.0034, "num_tokens": 74605196.0, "reward": 1.133333444595337, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2481 }, { "completion_length": 2240.8334350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 5810.0, "completions/max_terminated_length": 5810.0, "completions/mean_length": 2240.83349609375, "completions/mean_terminated_length": 2240.83349609375, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.8419267299864315, "frac_reward_zero_std": 0.5, "grad_norm": 0.8002076745033264, "kl": 0.0, "learning_rate": 8.05728088336784e-08, "loss": -0.0485, "num_tokens": 74644902.0, "reward": 0.9500000476837158, "reward_std": 0.27386125922203064, "rewards/correctness_reward_func/mean": 0.6499999761581421, "rewards/correctness_reward_func/std": 0.40113475918769836, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2482 }, { "completion_length": 1107.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2171.0, "completions/max_terminated_length": 2171.0, "completions/mean_length": 1107.3333740234375, "completions/mean_terminated_length": 1107.3333740234375, "completions/min_length": 575.0, "completions/min_terminated_length": 575.0, "epoch": 0.8422659430122117, "frac_reward_zero_std": 0.0, "grad_norm": 0.16217297315597534, "kl": 0.0, "learning_rate": 8.040027605244996e-08, "loss": 0.0024, "num_tokens": 74671408.0, "reward": 1.2000000476837158, "reward_std": 0.10954447835683823, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2483 }, { "completion_length": 746.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1402.0, "completions/max_terminated_length": 1402.0, "completions/mean_length": 746.0833740234375, "completions/mean_terminated_length": 746.0833740234375, "completions/min_length": 547.0, "completions/min_terminated_length": 547.0, "epoch": 0.8426051560379919, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 8.022774327122153e-08, "loss": 0.0, "num_tokens": 74690171.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2484 }, { "completion_length": 682.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 926.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 682.1666870117188, "completions/mean_terminated_length": 682.1666870117188, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 0.842944369063772, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 8.00552104899931e-08, "loss": 0.0, "num_tokens": 74709751.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2485 }, { "completion_length": 574.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 574.9166870117188, "completions/mean_terminated_length": 574.9166870117188, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 0.8432835820895522, "frac_reward_zero_std": 0.5, "grad_norm": 0.06882691383361816, "kl": 0.0, "learning_rate": 7.988267770876467e-08, "loss": 0.0001, "num_tokens": 74722788.0, "reward": 0.7333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2486 }, { "completion_length": 789.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1483.0, "completions/max_terminated_length": 1483.0, "completions/mean_length": 789.4166870117188, "completions/mean_terminated_length": 789.4166870117188, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 0.8436227951153324, "frac_reward_zero_std": 0.5, "grad_norm": 0.067764513194561, "kl": 0.0, "learning_rate": 7.971014492753623e-08, "loss": 0.0016, "num_tokens": 74745797.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2487 }, { "completion_length": 601.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1531.0, "completions/max_terminated_length": 1531.0, "completions/mean_length": 601.0, "completions/mean_terminated_length": 601.0, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.8439620081411127, "frac_reward_zero_std": 1.0, "grad_norm": 1.0926159177415684e-07, "kl": 0.0, "learning_rate": 7.953761214630779e-08, "loss": 0.0, "num_tokens": 74765615.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2488 }, { "completion_length": 1278.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2056.0, "completions/max_terminated_length": 2056.0, "completions/mean_length": 1278.5833740234375, "completions/mean_terminated_length": 1278.5833740234375, "completions/min_length": 623.0, "completions/min_terminated_length": 623.0, "epoch": 0.8443012211668928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 7.936507936507936e-08, "loss": 0.0, "num_tokens": 74788818.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2489 }, { "completion_length": 820.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1973.0, "completions/max_terminated_length": 1973.0, "completions/mean_length": 820.25, "completions/mean_terminated_length": 820.25, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 0.844640434192673, "frac_reward_zero_std": 0.5, "grad_norm": 0.5424062013626099, "kl": 0.0, "learning_rate": 7.919254658385092e-08, "loss": -0.012, "num_tokens": 74807889.0, "reward": 1.0333333015441895, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.44585633277893066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2490 }, { "completion_length": 766.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1205.0, "completions/max_terminated_length": 1205.0, "completions/mean_length": 766.0, "completions/mean_terminated_length": 766.0, "completions/min_length": 586.0, "completions/min_terminated_length": 586.0, "epoch": 0.8449796472184532, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 7.902001380262249e-08, "loss": 0.0, "num_tokens": 74827959.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2491 }, { "completion_length": 364.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 364.8333435058594, "completions/mean_terminated_length": 364.8333435058594, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.8453188602442334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 7.884748102139406e-08, "loss": 0.0, "num_tokens": 74845873.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2492 }, { "completion_length": 661.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 661.1666870117188, "completions/mean_terminated_length": 661.1666870117188, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "epoch": 0.8456580732700135, "frac_reward_zero_std": 1.0, "grad_norm": 1.3964061906790448e-07, "kl": 0.0, "learning_rate": 7.867494824016562e-08, "loss": 0.0, "num_tokens": 74868093.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2493 }, { "completion_length": 892.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1823.0, "completions/max_terminated_length": 1823.0, "completions/mean_length": 892.4166870117188, "completions/mean_terminated_length": 892.4166870117188, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.8459972862957937, "frac_reward_zero_std": 0.5, "grad_norm": 0.08629774302244186, "kl": 0.0, "learning_rate": 7.85024154589372e-08, "loss": 0.0014, "num_tokens": 74894390.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2494 }, { "completion_length": 1256.0833740234375, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 4825.0, "completions/mean_length": 2903.33349609375, "completions/mean_terminated_length": 1674.77783203125, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.846336499321574, "frac_reward_zero_std": 0.5, "grad_norm": 0.8037607669830322, "kl": NaN, "learning_rate": 7.832988267770877e-08, "loss": -0.0941, "num_tokens": 74922165.0, "reward": 0.4583333432674408, "reward_std": 0.33973026275634766, "rewards/correctness_reward_func/mean": 0.23333333432674408, "rewards/correctness_reward_func/std": 0.42497771978378296, "rewards/format_reward_func/mean": 0.2250000238418579, "rewards/format_reward_func/std": 0.13568010926246643, "step": 2495 }, { "completion_length": 931.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2062.0, "completions/max_terminated_length": 2062.0, "completions/mean_length": 931.4166870117188, "completions/mean_terminated_length": 931.4166870117188, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 0.8466757123473542, "frac_reward_zero_std": 0.0, "grad_norm": 0.42255064845085144, "kl": 0.0, "learning_rate": 7.815734989648033e-08, "loss": -0.0062, "num_tokens": 74948132.0, "reward": 0.8833333849906921, "reward_std": 0.26133137941360474, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2496 }, { "completion_length": 1117.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2630.0, "completions/max_terminated_length": 2630.0, "completions/mean_length": 1117.0833740234375, "completions/mean_terminated_length": 1117.0833740234375, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 0.8470149253731343, "frac_reward_zero_std": 0.5, "grad_norm": 0.5731571912765503, "kl": 0.0, "learning_rate": 7.79848171152519e-08, "loss": 0.0228, "num_tokens": 74976201.0, "reward": 0.9500000476837158, "reward_std": 0.27386125922203064, "rewards/correctness_reward_func/mean": 0.6499999761581421, "rewards/correctness_reward_func/std": 0.40113475918769836, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2497 }, { "completion_length": 2579.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4985.0, "completions/max_terminated_length": 4985.0, "completions/mean_length": 2579.08349609375, "completions/mean_terminated_length": 2579.08349609375, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 0.8473541383989145, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 7.781228433402347e-08, "loss": 0.0, "num_tokens": 75022750.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2498 }, { "completion_length": 622.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 883.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 622.5833740234375, "completions/mean_terminated_length": 622.5833740234375, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.8476933514246947, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 7.763975155279502e-08, "loss": 0.0, "num_tokens": 75040205.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2499 }, { "completion_length": 1083.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 1927.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 1083.416748046875, "completions/mean_terminated_length": 1083.416748046875, "completions/min_length": 667.0, "completions/min_terminated_length": 667.0, "epoch": 0.8480325644504749, "frac_reward_zero_std": 0.5, "grad_norm": 0.3294588327407837, "kl": 0.0, "learning_rate": 7.746721877156658e-08, "loss": -0.0015, "num_tokens": 75065848.0, "reward": 1.0333335399627686, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2500 }, { "completion_length": 1343.3333740234375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 4645.0, "completions/mean_length": 2441.5, "completions/mean_terminated_length": 1612.0, "completions/min_length": 731.0, "completions/min_terminated_length": 731.0, "epoch": 0.8483717774762551, "frac_reward_zero_std": 0.5, "grad_norm": 0.6518562436103821, "kl": NaN, "learning_rate": 7.729468599033815e-08, "loss": -0.0441, "num_tokens": 75093050.0, "reward": 1.024999976158142, "reward_std": 0.3061861991882324, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 2501 }, { "completion_length": 827.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1509.0, "completions/max_terminated_length": 1509.0, "completions/mean_length": 827.4166870117188, "completions/mean_terminated_length": 827.4166870117188, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.8487109905020352, "frac_reward_zero_std": 1.0, "grad_norm": 1.8679550350952923e-07, "kl": 0.0, "learning_rate": 7.712215320910973e-08, "loss": 0.0, "num_tokens": 75113641.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2502 }, { "completion_length": 943.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 3537.0, "completions/max_terminated_length": 3537.0, "completions/mean_length": 943.25, "completions/mean_terminated_length": 943.25, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.8490502035278155, "frac_reward_zero_std": 0.5, "grad_norm": 0.5133549571037292, "kl": 0.0, "learning_rate": 7.69496204278813e-08, "loss": -0.0265, "num_tokens": 75135802.0, "reward": 0.5, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.20000000298023224, "rewards/correctness_reward_func/std": 0.36181360483169556, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2503 }, { "completion_length": 841.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2016.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 841.3333740234375, "completions/mean_terminated_length": 841.3333740234375, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.8493894165535957, "frac_reward_zero_std": 0.5, "grad_norm": 0.08569307625293732, "kl": 0.0, "learning_rate": 7.677708764665286e-08, "loss": 0.0036, "num_tokens": 75155696.0, "reward": 1.1666667461395264, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2504 }, { "completion_length": 1275.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2122.0, "completions/max_terminated_length": 2122.0, "completions/mean_length": 1275.75, "completions/mean_terminated_length": 1275.75, "completions/min_length": 703.0, "completions/min_terminated_length": 703.0, "epoch": 0.8497286295793759, "frac_reward_zero_std": 0.5, "grad_norm": 0.0932522788643837, "kl": 0.0, "learning_rate": 7.660455486542443e-08, "loss": -0.001, "num_tokens": 75184913.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2505 }, { "completion_length": 1057.1667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 1729.0, "completions/max_terminated_length": 1729.0, "completions/mean_length": 1057.166748046875, "completions/mean_terminated_length": 1057.166748046875, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 0.850067842605156, "frac_reward_zero_std": 0.5, "grad_norm": 0.40471941232681274, "kl": 0.0, "learning_rate": 7.6432022084196e-08, "loss": -0.0062, "num_tokens": 75211993.0, "reward": 0.9666668176651001, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2506 }, { "completion_length": 1839.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4672.0, "completions/max_terminated_length": 4672.0, "completions/mean_length": 1839.25, "completions/mean_terminated_length": 1839.25, "completions/min_length": 556.0, "completions/min_terminated_length": 556.0, "epoch": 0.8504070556309362, "frac_reward_zero_std": 0.0, "grad_norm": 0.7421292662620544, "kl": 0.0, "learning_rate": 7.625948930296756e-08, "loss": -0.0233, "num_tokens": 75245824.0, "reward": 0.9000000357627869, "reward_std": 0.2892930209636688, "rewards/correctness_reward_func/mean": 0.5999999642372131, "rewards/correctness_reward_func/std": 0.45126086473464966, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2507 }, { "completion_length": 983.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3698.0, "completions/max_terminated_length": 3698.0, "completions/mean_length": 983.0833740234375, "completions/mean_terminated_length": 983.0833740234375, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.8507462686567164, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 7.608695652173913e-08, "loss": 0.0, "num_tokens": 75265331.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2508 }, { "completion_length": 925.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1731.0, "completions/max_terminated_length": 1731.0, "completions/mean_length": 925.4166870117188, "completions/mean_terminated_length": 925.4166870117188, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 0.8510854816824966, "frac_reward_zero_std": 1.0, "grad_norm": 2.21497089114564e-07, "kl": 0.0, "learning_rate": 7.59144237405107e-08, "loss": 0.0, "num_tokens": 75288442.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2509 }, { "completion_length": 1819.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3583.0, "completions/max_terminated_length": 3583.0, "completions/mean_length": 1819.0, "completions/mean_terminated_length": 1819.0, "completions/min_length": 788.0, "completions/min_terminated_length": 788.0, "epoch": 0.8514246947082768, "frac_reward_zero_std": 0.5, "grad_norm": 0.0790194571018219, "kl": 0.0, "learning_rate": 7.574189095928226e-08, "loss": 0.0002, "num_tokens": 75319720.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2510 }, { "completion_length": 652.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1180.0, "completions/max_terminated_length": 1180.0, "completions/mean_length": 652.25, "completions/mean_terminated_length": 652.25, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.851763907734057, "frac_reward_zero_std": 1.0, "grad_norm": 1.6501120114753576e-07, "kl": 0.0, "learning_rate": 7.556935817805382e-08, "loss": 0.0, "num_tokens": 75339931.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2511 }, { "completion_length": 548.8333587646484, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 548.8333740234375, "completions/mean_terminated_length": 548.8333740234375, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.8521031207598372, "frac_reward_zero_std": 0.5, "grad_norm": 0.055393245071172714, "kl": 0.0, "learning_rate": 7.539682539682539e-08, "loss": -0.0005, "num_tokens": 75357023.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2512 }, { "completion_length": 1105.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1903.0, "completions/max_terminated_length": 1903.0, "completions/mean_length": 1105.8333740234375, "completions/mean_terminated_length": 1105.8333740234375, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.8524423337856174, "frac_reward_zero_std": 0.5, "grad_norm": 0.08649242669343948, "kl": 0.0, "learning_rate": 7.522429261559696e-08, "loss": -0.0008, "num_tokens": 75384705.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2513 }, { "completion_length": 1002.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1638.0, "completions/max_terminated_length": 1638.0, "completions/mean_length": 1002.9166870117188, "completions/mean_terminated_length": 1002.9166870117188, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 0.8527815468113975, "frac_reward_zero_std": 1.0, "grad_norm": 1.1649424891402305e-07, "kl": 0.0, "learning_rate": 7.505175983436852e-08, "loss": 0.0, "num_tokens": 75405752.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2514 }, { "completion_length": 1887.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3004.0, "completions/max_terminated_length": 3004.0, "completions/mean_length": 1887.166748046875, "completions/mean_terminated_length": 1887.166748046875, "completions/min_length": 846.0, "completions/min_terminated_length": 846.0, "epoch": 0.8531207598371777, "frac_reward_zero_std": 0.5, "grad_norm": 0.3501369059085846, "kl": 0.0, "learning_rate": 7.487922705314009e-08, "loss": -0.0073, "num_tokens": 75437242.0, "reward": 0.6499999761581421, "reward_std": 0.17606817185878754, "rewards/correctness_reward_func/mean": 0.3500000238418579, "rewards/correctness_reward_func/std": 0.4358898997306824, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2515 }, { "completion_length": 2332.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5199.0, "completions/max_terminated_length": 5199.0, "completions/mean_length": 2332.416748046875, "completions/mean_terminated_length": 2332.416748046875, "completions/min_length": 546.0, "completions/min_terminated_length": 546.0, "epoch": 0.8534599728629579, "frac_reward_zero_std": 0.5, "grad_norm": 0.0869336724281311, "kl": 0.0, "learning_rate": 7.470669427191165e-08, "loss": -0.001, "num_tokens": 75479127.0, "reward": 0.7666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2516 }, { "completion_length": 587.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 587.25, "completions/mean_terminated_length": 587.25, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 0.8537991858887382, "frac_reward_zero_std": 0.0, "grad_norm": 0.11688544601202011, "kl": 0.0, "learning_rate": 7.453416149068323e-08, "loss": -0.0002, "num_tokens": 75501810.0, "reward": 1.183333396911621, "reward_std": 0.10641198605298996, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2517 }, { "completion_length": 1590.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3336.0, "completions/max_terminated_length": 3336.0, "completions/mean_length": 1590.75, "completions/mean_terminated_length": 1590.75, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.8541383989145184, "frac_reward_zero_std": 1.0, "grad_norm": 1.3607984783448046e-07, "kl": 0.0, "learning_rate": 7.43616287094548e-08, "loss": 0.0, "num_tokens": 75537267.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2518 }, { "completion_length": 1809.25, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 4615.0, "completions/mean_length": 2358.33349609375, "completions/mean_terminated_length": 1973.727294921875, "completions/min_length": 671.0, "completions/min_terminated_length": 671.0, "epoch": 0.8544776119402985, "frac_reward_zero_std": 0.5, "grad_norm": 0.07336635887622833, "kl": NaN, "learning_rate": 7.418909592822637e-08, "loss": -0.0105, "num_tokens": 75568080.0, "reward": 0.6750000715255737, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 2519 }, { "completion_length": 1588.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5053.0, "completions/max_terminated_length": 5053.0, "completions/mean_length": 1588.666748046875, "completions/mean_terminated_length": 1588.666748046875, "completions/min_length": 615.0, "completions/min_terminated_length": 615.0, "epoch": 0.8548168249660787, "frac_reward_zero_std": 0.0, "grad_norm": 0.14108093082904816, "kl": 0.0, "learning_rate": 7.401656314699793e-08, "loss": -0.0063, "num_tokens": 75600656.0, "reward": 1.2000000476837158, "reward_std": 0.08164963126182556, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2520 }, { "completion_length": 1280.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2713.0, "completions/max_terminated_length": 2713.0, "completions/mean_length": 1280.8333740234375, "completions/mean_terminated_length": 1280.8333740234375, "completions/min_length": 538.0, "completions/min_terminated_length": 538.0, "epoch": 0.8551560379918589, "frac_reward_zero_std": 0.5, "grad_norm": 0.42569267749786377, "kl": 0.0, "learning_rate": 7.38440303657695e-08, "loss": -0.005, "num_tokens": 75624852.0, "reward": 0.7833334803581238, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4833333194255829, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2521 }, { "completion_length": 963.8333435058594, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 3568.0, "completions/mean_length": 2611.08349609375, "completions/mean_terminated_length": 1285.111083984375, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 0.8554952510176391, "frac_reward_zero_std": 0.5, "grad_norm": 0.9885018467903137, "kl": NaN, "learning_rate": 7.367149758454105e-08, "loss": -0.0661, "num_tokens": 75649498.0, "reward": 0.9750000238418579, "reward_std": 0.35601967573165894, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 2522 }, { "completion_length": 1184.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3301.0, "completions/max_terminated_length": 3301.0, "completions/mean_length": 1184.0833740234375, "completions/mean_terminated_length": 1184.0833740234375, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.8558344640434192, "frac_reward_zero_std": 0.5, "grad_norm": 0.28702667355537415, "kl": 0.0, "learning_rate": 7.349896480331262e-08, "loss": -0.0149, "num_tokens": 75676727.0, "reward": 1.2166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2523 }, { "completion_length": 520.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 520.8333740234375, "completions/mean_terminated_length": 520.8333740234375, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 0.8561736770691994, "frac_reward_zero_std": 0.5, "grad_norm": 0.05176064372062683, "kl": 0.0, "learning_rate": 7.332643202208418e-08, "loss": 0.0003, "num_tokens": 75696411.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2524 }, { "completion_length": 997.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1607.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 997.5, "completions/mean_terminated_length": 997.5, "completions/min_length": 649.0, "completions/min_terminated_length": 649.0, "epoch": 0.8565128900949797, "frac_reward_zero_std": 0.0, "grad_norm": 0.09894651919603348, "kl": 0.0, "learning_rate": 7.315389924085576e-08, "loss": -0.0009, "num_tokens": 75721299.0, "reward": 1.2166666984558105, "reward_std": 0.09246456623077393, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2525 }, { "completion_length": 734.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1253.0, "completions/max_terminated_length": 1253.0, "completions/mean_length": 734.8333740234375, "completions/mean_terminated_length": 734.8333740234375, "completions/min_length": 525.0, "completions/min_terminated_length": 525.0, "epoch": 0.8568521031207599, "frac_reward_zero_std": 0.5, "grad_norm": 0.04775422066450119, "kl": 0.0, "learning_rate": 7.298136645962733e-08, "loss": -0.0002, "num_tokens": 75739273.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2526 }, { "completion_length": 635.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 944.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 635.0833740234375, "completions/mean_terminated_length": 635.0833740234375, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.85719131614654, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 7.28088336783989e-08, "loss": 0.0, "num_tokens": 75759746.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2527 }, { "completion_length": 712.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1378.0, "completions/max_terminated_length": 1378.0, "completions/mean_length": 712.4166870117188, "completions/mean_terminated_length": 712.4166870117188, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 0.8575305291723202, "frac_reward_zero_std": 0.5, "grad_norm": 0.08618317544460297, "kl": 0.0, "learning_rate": 7.263630089717046e-08, "loss": 0.0002, "num_tokens": 75778045.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.09045340120792389, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2528 }, { "completion_length": 737.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1225.0, "completions/max_terminated_length": 1225.0, "completions/mean_length": 737.1666870117188, "completions/mean_terminated_length": 737.1666870117188, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.8578697421981004, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 7.246376811594203e-08, "loss": 0.0, "num_tokens": 75799593.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2529 }, { "completion_length": 655.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 899.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 655.0, "completions/mean_terminated_length": 655.0, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 0.8582089552238806, "frac_reward_zero_std": 0.5, "grad_norm": 0.07153506577014923, "kl": 0.0, "learning_rate": 7.229123533471359e-08, "loss": 0.0, "num_tokens": 75821811.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2530 }, { "completion_length": 1440.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2814.0, "completions/max_terminated_length": 2814.0, "completions/mean_length": 1440.25, "completions/mean_terminated_length": 1440.25, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.8585481682496607, "frac_reward_zero_std": 1.0, "grad_norm": 8.3620889768099e-08, "kl": 0.0, "learning_rate": 7.211870255348516e-08, "loss": 0.0, "num_tokens": 75852690.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2531 }, { "completion_length": 1017.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1431.0, "completions/max_terminated_length": 1431.0, "completions/mean_length": 1017.1666870117188, "completions/mean_terminated_length": 1017.1666870117188, "completions/min_length": 733.0, "completions/min_terminated_length": 733.0, "epoch": 0.858887381275441, "frac_reward_zero_std": 1.0, "grad_norm": 1.8287475711531442e-07, "kl": 0.0, "learning_rate": 7.194616977225673e-08, "loss": 0.0, "num_tokens": 75880004.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2532 }, { "completion_length": 1314.2500610351562, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 3191.0, "completions/mean_length": 2412.416748046875, "completions/mean_terminated_length": 1577.0999755859375, "completions/min_length": 566.0, "completions/min_terminated_length": 566.0, "epoch": 0.8592265943012212, "frac_reward_zero_std": 0.5, "grad_norm": 0.751532793045044, "kl": NaN, "learning_rate": 7.177363699102829e-08, "loss": -0.0475, "num_tokens": 75906005.0, "reward": 0.949999988079071, "reward_std": 0.279284805059433, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.43064433336257935, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 2533 }, { "completion_length": 1386.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3020.0, "completions/max_terminated_length": 3020.0, "completions/mean_length": 1386.8333740234375, "completions/mean_terminated_length": 1386.8333740234375, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.8595658073270014, "frac_reward_zero_std": 0.5, "grad_norm": 0.08955075591802597, "kl": 0.0, "learning_rate": 7.160110420979986e-08, "loss": 0.0008, "num_tokens": 75934887.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2534 }, { "completion_length": 630.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1600.0, "completions/max_terminated_length": 1600.0, "completions/mean_length": 630.1666870117188, "completions/mean_terminated_length": 630.1666870117188, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.8599050203527816, "frac_reward_zero_std": 0.5, "grad_norm": 0.0740213468670845, "kl": 0.0, "learning_rate": 7.142857142857142e-08, "loss": 0.0004, "num_tokens": 75955397.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2535 }, { "completion_length": 671.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1319.0, "completions/max_terminated_length": 1319.0, "completions/mean_length": 671.0, "completions/mean_terminated_length": 671.0, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 0.8602442333785617, "frac_reward_zero_std": 0.5, "grad_norm": 0.08177324384450912, "kl": 0.0, "learning_rate": 7.125603864734299e-08, "loss": -0.0003, "num_tokens": 75976163.0, "reward": 1.25, "reward_std": 0.05477222427725792, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2536 }, { "completion_length": 1682.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 4670.0, "completions/max_terminated_length": 4670.0, "completions/mean_length": 1682.3333740234375, "completions/mean_terminated_length": 1682.3333740234375, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.8605834464043419, "frac_reward_zero_std": 0.5, "grad_norm": 0.5307982563972473, "kl": 0.0, "learning_rate": 7.108350586611456e-08, "loss": -0.0174, "num_tokens": 76007463.0, "reward": 0.8666666746139526, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2537 }, { "completion_length": 686.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1321.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 686.3333740234375, "completions/mean_terminated_length": 686.3333740234375, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 0.8609226594301221, "frac_reward_zero_std": 1.0, "grad_norm": 8.935127482345706e-08, "kl": 0.0, "learning_rate": 7.091097308488612e-08, "loss": 0.0, "num_tokens": 76027501.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2538 }, { "completion_length": 1283.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3347.0, "completions/max_terminated_length": 3347.0, "completions/mean_length": 1283.5, "completions/mean_terminated_length": 1283.5, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 0.8612618724559024, "frac_reward_zero_std": 1.0, "grad_norm": 2.8325965217845805e-07, "kl": 0.0, "learning_rate": 7.073844030365769e-08, "loss": 0.0, "num_tokens": 76052605.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2539 }, { "completion_length": 587.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 587.9166870117188, "completions/mean_terminated_length": 587.9166870117188, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 0.8616010854816825, "frac_reward_zero_std": 0.5, "grad_norm": 0.07899951934814453, "kl": 0.0, "learning_rate": 7.056590752242927e-08, "loss": 0.0001, "num_tokens": 76067334.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2540 }, { "completion_length": 2373.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 6413.0, "completions/max_terminated_length": 6413.0, "completions/mean_length": 2373.666748046875, "completions/mean_terminated_length": 2373.666748046875, "completions/min_length": 570.0, "completions/min_terminated_length": 570.0, "epoch": 0.8619402985074627, "frac_reward_zero_std": 0.0, "grad_norm": 0.5389992594718933, "kl": 0.0, "learning_rate": 7.039337474120083e-08, "loss": 0.0133, "num_tokens": 76106120.0, "reward": 0.7833334803581238, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.43029236793518066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2541 }, { "completion_length": 834.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1157.0, "completions/max_terminated_length": 1157.0, "completions/mean_length": 834.25, "completions/mean_terminated_length": 834.25, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "epoch": 0.8622795115332429, "frac_reward_zero_std": 1.0, "grad_norm": 1.0475923062358561e-07, "kl": 0.0, "learning_rate": 7.02208419599724e-08, "loss": 0.0, "num_tokens": 76127627.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2542 }, { "completion_length": 967.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1442.0, "completions/max_terminated_length": 1442.0, "completions/mean_length": 967.0, "completions/mean_terminated_length": 967.0, "completions/min_length": 601.0, "completions/min_terminated_length": 601.0, "epoch": 0.8626187245590231, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 7.004830917874397e-08, "loss": 0.0, "num_tokens": 76151993.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2543 }, { "completion_length": 644.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 644.5833740234375, "completions/mean_terminated_length": 644.5833740234375, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.8629579375848032, "frac_reward_zero_std": 1.0, "grad_norm": 1.1870705662886394e-07, "kl": 0.0, "learning_rate": 6.987577639751552e-08, "loss": 0.0, "num_tokens": 76175970.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2544 }, { "completion_length": 1589.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2837.0, "completions/max_terminated_length": 2837.0, "completions/mean_length": 1589.416748046875, "completions/mean_terminated_length": 1589.416748046875, "completions/min_length": 730.0, "completions/min_terminated_length": 730.0, "epoch": 0.8632971506105834, "frac_reward_zero_std": 0.5, "grad_norm": 0.09775371104478836, "kl": 0.0, "learning_rate": 6.970324361628708e-08, "loss": 0.0007, "num_tokens": 76205279.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2545 }, { "completion_length": 1018.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1822.0, "completions/max_terminated_length": 1822.0, "completions/mean_length": 1018.75, "completions/mean_terminated_length": 1018.75, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.8636363636363636, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 6.953071083505865e-08, "loss": 0.0, "num_tokens": 76225370.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2546 }, { "completion_length": 1076.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2096.0, "completions/max_terminated_length": 2096.0, "completions/mean_length": 1076.916748046875, "completions/mean_terminated_length": 1076.916748046875, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "epoch": 0.8639755766621439, "frac_reward_zero_std": 0.5, "grad_norm": 0.08731096982955933, "kl": 0.0, "learning_rate": 6.935817805383022e-08, "loss": 0.0014, "num_tokens": 76249327.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2547 }, { "completion_length": 656.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1046.0, "completions/max_terminated_length": 1046.0, "completions/mean_length": 656.6666870117188, "completions/mean_terminated_length": 656.6666870117188, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 0.864314789687924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 6.91856452726018e-08, "loss": 0.0, "num_tokens": 76274877.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2548 }, { "completion_length": 466.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 466.0833435058594, "completions/mean_terminated_length": 466.0833435058594, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.8646540027137042, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 6.901311249137336e-08, "loss": 0.0, "num_tokens": 76293562.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2549 }, { "completion_length": 2264.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3602.0, "completions/max_terminated_length": 3602.0, "completions/mean_length": 2264.166748046875, "completions/mean_terminated_length": 2264.166748046875, "completions/min_length": 1129.0, "completions/min_terminated_length": 1129.0, "epoch": 0.8649932157394844, "frac_reward_zero_std": 1.0, "grad_norm": 1.6060471352830064e-07, "kl": 0.0, "learning_rate": 6.884057971014493e-08, "loss": 0.0, "num_tokens": 76332936.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2550 }, { "completion_length": 1697.2500610351562, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5630.0, "completions/mean_length": 2246.33349609375, "completions/mean_terminated_length": 1851.5455322265625, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.8653324287652646, "frac_reward_zero_std": 0.5, "grad_norm": 0.11818966269493103, "kl": NaN, "learning_rate": 6.86680469289165e-08, "loss": -0.011, "num_tokens": 76362813.0, "reward": 0.27500003576278687, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 2551 }, { "completion_length": 1835.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4051.0, "completions/max_terminated_length": 4051.0, "completions/mean_length": 1835.3333740234375, "completions/mean_terminated_length": 1835.3333740234375, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.8656716417910447, "frac_reward_zero_std": 0.5, "grad_norm": 0.13505332171916962, "kl": 0.0, "learning_rate": 6.849551414768806e-08, "loss": 0.0002, "num_tokens": 76393225.0, "reward": 0.7333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2552 }, { "completion_length": 810.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1628.0, "completions/max_terminated_length": 1628.0, "completions/mean_length": 810.8333740234375, "completions/mean_terminated_length": 810.8333740234375, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 0.8660108548168249, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 6.832298136645963e-08, "loss": 0.0, "num_tokens": 76417277.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2553 }, { "completion_length": 726.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1365.0, "completions/max_terminated_length": 1365.0, "completions/mean_length": 726.5833740234375, "completions/mean_terminated_length": 726.5833740234375, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 0.8663500678426052, "frac_reward_zero_std": 1.0, "grad_norm": 1.0829255359112722e-07, "kl": 0.0, "learning_rate": 6.815044858523119e-08, "loss": 0.0, "num_tokens": 76439070.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2554 }, { "completion_length": 1920.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3921.0, "completions/max_terminated_length": 3921.0, "completions/mean_length": 1920.166748046875, "completions/mean_terminated_length": 1920.166748046875, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.8666892808683854, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 6.797791580400276e-08, "loss": 0.0, "num_tokens": 76474034.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2555 }, { "completion_length": 1715.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4019.0, "completions/max_terminated_length": 4019.0, "completions/mean_length": 1715.916748046875, "completions/mean_terminated_length": 1715.916748046875, "completions/min_length": 622.0, "completions/min_terminated_length": 622.0, "epoch": 0.8670284938941656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 6.780538302277432e-08, "loss": 0.0, "num_tokens": 76501903.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2556 }, { "completion_length": 903.0000457763672, "completions/clipped_ratio": 0.0, "completions/max_length": 2304.0, "completions/max_terminated_length": 2304.0, "completions/mean_length": 903.0, "completions/mean_terminated_length": 903.0, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.8673677069199457, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 6.763285024154589e-08, "loss": 0.0, "num_tokens": 76523527.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2557 }, { "completion_length": 1078.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1845.0, "completions/max_terminated_length": 1845.0, "completions/mean_length": 1078.25, "completions/mean_terminated_length": 1078.25, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.8677069199457259, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 6.746031746031746e-08, "loss": 0.0, "num_tokens": 76548532.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2558 }, { "completion_length": 648.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1499.0, "completions/max_terminated_length": 1499.0, "completions/mean_length": 648.8333740234375, "completions/mean_terminated_length": 648.8333740234375, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.8680461329715061, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 6.728778467908902e-08, "loss": 0.0, "num_tokens": 76568444.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2559 }, { "completion_length": 966.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2913.0, "completions/max_terminated_length": 2913.0, "completions/mean_length": 966.0833740234375, "completions/mean_terminated_length": 966.0833740234375, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 0.8683853459972863, "frac_reward_zero_std": 0.5, "grad_norm": 0.1048663929104805, "kl": 0.0, "learning_rate": 6.711525189786059e-08, "loss": -0.0038, "num_tokens": 76593969.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2560 }, { "completion_length": 787.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1422.0, "completions/max_terminated_length": 1422.0, "completions/mean_length": 787.0, "completions/mean_terminated_length": 787.0, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 0.8687245590230664, "frac_reward_zero_std": 0.5, "grad_norm": 0.07451961934566498, "kl": 0.0, "learning_rate": 6.694271911663215e-08, "loss": 0.0004, "num_tokens": 76615545.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2561 }, { "completion_length": 756.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1281.0, "completions/max_terminated_length": 1281.0, "completions/mean_length": 756.3333740234375, "completions/mean_terminated_length": 756.3333740234375, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.8690637720488467, "frac_reward_zero_std": 1.0, "grad_norm": 8.172396093186762e-08, "kl": 0.0, "learning_rate": 6.677018633540372e-08, "loss": 0.0, "num_tokens": 76634647.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2562 }, { "completion_length": 1018.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2843.0, "completions/max_terminated_length": 2843.0, "completions/mean_length": 1018.3333740234375, "completions/mean_terminated_length": 1018.3333740234375, "completions/min_length": 491.0, "completions/min_terminated_length": 491.0, "epoch": 0.8694029850746269, "frac_reward_zero_std": 0.5, "grad_norm": 0.3759530186653137, "kl": 0.0, "learning_rate": 6.659765355417529e-08, "loss": -0.0035, "num_tokens": 76659131.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2563 }, { "completion_length": 567.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 567.4166870117188, "completions/mean_terminated_length": 567.4166870117188, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.8697421981004071, "frac_reward_zero_std": 0.5, "grad_norm": 0.06246677786111832, "kl": 0.0, "learning_rate": 6.642512077294687e-08, "loss": -0.0, "num_tokens": 76680046.0, "reward": 0.7333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2564 }, { "completion_length": 780.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1086.0, "completions/max_terminated_length": 1086.0, "completions/mean_length": 780.9166870117188, "completions/mean_terminated_length": 780.9166870117188, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 0.8700814111261872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 6.625258799171843e-08, "loss": 0.0, "num_tokens": 76702809.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2565 }, { "completion_length": 657.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1323.0, "completions/max_terminated_length": 1323.0, "completions/mean_length": 657.9166870117188, "completions/mean_terminated_length": 657.9166870117188, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.8704206241519674, "frac_reward_zero_std": 0.5, "grad_norm": 0.06373082101345062, "kl": 0.0, "learning_rate": 6.608005521049e-08, "loss": -0.0004, "num_tokens": 76721798.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2566 }, { "completion_length": 1481.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2390.0, "completions/max_terminated_length": 2390.0, "completions/mean_length": 1481.0, "completions/mean_terminated_length": 1481.0, "completions/min_length": 643.0, "completions/min_terminated_length": 643.0, "epoch": 0.8707598371777476, "frac_reward_zero_std": 0.5, "grad_norm": 0.588994026184082, "kl": 0.0, "learning_rate": 6.590752242926155e-08, "loss": -0.0114, "num_tokens": 76751816.0, "reward": 1.000000238418579, "reward_std": 0.23664319515228271, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.3357488214969635, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2567 }, { "completion_length": 1522.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4205.0, "completions/max_terminated_length": 4205.0, "completions/mean_length": 1522.166748046875, "completions/mean_terminated_length": 1522.166748046875, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.8710990502035278, "frac_reward_zero_std": 0.5, "grad_norm": 0.09794683754444122, "kl": 0.0, "learning_rate": 6.573498964803312e-08, "loss": -0.0025, "num_tokens": 76785394.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2568 }, { "completion_length": 1248.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2117.0, "completions/max_terminated_length": 2117.0, "completions/mean_length": 1248.416748046875, "completions/mean_terminated_length": 1248.416748046875, "completions/min_length": 677.0, "completions/min_terminated_length": 677.0, "epoch": 0.871438263229308, "frac_reward_zero_std": 1.0, "grad_norm": 1.2705562824066874e-07, "kl": 0.0, "learning_rate": 6.556245686680468e-08, "loss": 0.0, "num_tokens": 76811883.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2569 }, { "completion_length": 687.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1285.0, "completions/max_terminated_length": 1285.0, "completions/mean_length": 687.75, "completions/mean_terminated_length": 687.75, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.8717774762550882, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 6.538992408557625e-08, "loss": 0.0, "num_tokens": 76827900.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2570 }, { "completion_length": 861.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1917.0, "completions/max_terminated_length": 1917.0, "completions/mean_length": 861.3333740234375, "completions/mean_terminated_length": 861.3333740234375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.8721166892808684, "frac_reward_zero_std": 0.5, "grad_norm": 0.10454007238149643, "kl": 0.0, "learning_rate": 6.521739130434782e-08, "loss": 0.0006, "num_tokens": 76847704.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2571 }, { "completion_length": 1425.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5570.0, "completions/max_terminated_length": 5570.0, "completions/mean_length": 1425.5833740234375, "completions/mean_terminated_length": 1425.5833740234375, "completions/min_length": 575.0, "completions/min_terminated_length": 575.0, "epoch": 0.8724559023066486, "frac_reward_zero_std": 0.5, "grad_norm": 0.666414201259613, "kl": 0.0, "learning_rate": 6.50448585231194e-08, "loss": 0.0438, "num_tokens": 76872023.0, "reward": 1.1166667938232422, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2572 }, { "completion_length": 1069.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1595.0, "completions/max_terminated_length": 1595.0, "completions/mean_length": 1069.0, "completions/mean_terminated_length": 1069.0, "completions/min_length": 624.0, "completions/min_terminated_length": 624.0, "epoch": 0.8727951153324288, "frac_reward_zero_std": 0.5, "grad_norm": 0.06653976440429688, "kl": 0.0, "learning_rate": 6.487232574189096e-08, "loss": -0.0003, "num_tokens": 76896443.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2573 }, { "completion_length": 713.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1412.0, "completions/max_terminated_length": 1412.0, "completions/mean_length": 713.6666870117188, "completions/mean_terminated_length": 713.6666870117188, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.8731343283582089, "frac_reward_zero_std": 1.0, "grad_norm": 9.758922203673137e-08, "kl": 0.0, "learning_rate": 6.469979296066253e-08, "loss": 0.0, "num_tokens": 76917409.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2574 }, { "completion_length": 1103.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2472.0, "completions/max_terminated_length": 2472.0, "completions/mean_length": 1103.3333740234375, "completions/mean_terminated_length": 1103.3333740234375, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "epoch": 0.8734735413839891, "frac_reward_zero_std": 0.0, "grad_norm": 0.5593957901000977, "kl": 0.0, "learning_rate": 6.452726017943409e-08, "loss": 0.0193, "num_tokens": 76941701.0, "reward": 0.6666666865348816, "reward_std": 0.4647580087184906, "rewards/correctness_reward_func/mean": 0.36666664481163025, "rewards/correctness_reward_func/std": 0.45792677998542786, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2575 }, { "completion_length": 968.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4357.0, "completions/max_terminated_length": 4357.0, "completions/mean_length": 968.5833740234375, "completions/mean_terminated_length": 968.5833740234375, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.8738127544097694, "frac_reward_zero_std": 0.0, "grad_norm": 0.5786733627319336, "kl": 0.0, "learning_rate": 6.435472739820566e-08, "loss": -0.0381, "num_tokens": 76969746.0, "reward": 0.8500000834465027, "reward_std": 0.2557638883590698, "rewards/correctness_reward_func/mean": 0.550000011920929, "rewards/correctness_reward_func/std": 0.49082493782043457, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2576 }, { "completion_length": 1134.4167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 3361.0, "completions/max_terminated_length": 3361.0, "completions/mean_length": 1134.416748046875, "completions/mean_terminated_length": 1134.416748046875, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.8741519674355496, "frac_reward_zero_std": 0.5, "grad_norm": 0.07009387761354446, "kl": 0.0, "learning_rate": 6.418219461697722e-08, "loss": 0.0004, "num_tokens": 76995137.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2577 }, { "completion_length": 1624.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3518.0, "completions/max_terminated_length": 3518.0, "completions/mean_length": 1624.916748046875, "completions/mean_terminated_length": 1624.916748046875, "completions/min_length": 594.0, "completions/min_terminated_length": 594.0, "epoch": 0.8744911804613297, "frac_reward_zero_std": 0.0, "grad_norm": 0.39385277032852173, "kl": 0.0, "learning_rate": 6.400966183574879e-08, "loss": -0.0044, "num_tokens": 77027038.0, "reward": 1.0333333015441895, "reward_std": 0.2581988573074341, "rewards/correctness_reward_func/mean": 0.7333332896232605, "rewards/correctness_reward_func/std": 0.3550501763820648, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2578 }, { "completion_length": 830.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1536.0, "completions/max_terminated_length": 1536.0, "completions/mean_length": 830.5, "completions/mean_terminated_length": 830.5, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.8748303934871099, "frac_reward_zero_std": 0.0, "grad_norm": 0.5860399007797241, "kl": 0.0, "learning_rate": 6.383712905452034e-08, "loss": -0.0112, "num_tokens": 77049688.0, "reward": 0.8833333849906921, "reward_std": 0.46232303977012634, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.5149286389350891, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2579 }, { "completion_length": 833.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1391.0, "completions/max_terminated_length": 1391.0, "completions/mean_length": 833.9166870117188, "completions/mean_terminated_length": 833.9166870117188, "completions/min_length": 526.0, "completions/min_terminated_length": 526.0, "epoch": 0.8751696065128901, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 6.366459627329192e-08, "loss": 0.0, "num_tokens": 77076423.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2580 }, { "completion_length": 1676.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3612.0, "completions/max_terminated_length": 3612.0, "completions/mean_length": 1676.3333740234375, "completions/mean_terminated_length": 1676.3333740234375, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.8755088195386703, "frac_reward_zero_std": 0.5, "grad_norm": 0.050853174179792404, "kl": 0.0, "learning_rate": 6.349206349206349e-08, "loss": -0.0002, "num_tokens": 77105917.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2581 }, { "completion_length": 1039.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2013.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1039.5833740234375, "completions/mean_terminated_length": 1039.5833740234375, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 0.8758480325644504, "frac_reward_zero_std": 0.5, "grad_norm": 0.44827762246131897, "kl": 0.0, "learning_rate": 6.331953071083506e-08, "loss": 0.0052, "num_tokens": 77129744.0, "reward": 0.6833333969116211, "reward_std": 0.19407902657985687, "rewards/correctness_reward_func/mean": 0.38333332538604736, "rewards/correctness_reward_func/std": 0.4783177673816681, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2582 }, { "completion_length": 2618.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5060.0, "completions/max_terminated_length": 5060.0, "completions/mean_length": 2618.25, "completions/mean_terminated_length": 2618.25, "completions/min_length": 717.0, "completions/min_terminated_length": 717.0, "epoch": 0.8761872455902306, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 6.314699792960662e-08, "loss": 0.0, "num_tokens": 77172359.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2583 }, { "completion_length": 699.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1271.0, "completions/max_terminated_length": 1271.0, "completions/mean_length": 699.5, "completions/mean_terminated_length": 699.5, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.8765264586160109, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 6.297446514837819e-08, "loss": 0.0, "num_tokens": 77190149.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2584 }, { "completion_length": 1471.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2434.0, "completions/max_terminated_length": 2434.0, "completions/mean_length": 1471.666748046875, "completions/mean_terminated_length": 1471.666748046875, "completions/min_length": 769.0, "completions/min_terminated_length": 769.0, "epoch": 0.8768656716417911, "frac_reward_zero_std": 0.5, "grad_norm": 0.06677349656820297, "kl": 0.0, "learning_rate": 6.280193236714975e-08, "loss": -0.0001, "num_tokens": 77219065.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2585 }, { "completion_length": 1491.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5722.0, "completions/max_terminated_length": 5722.0, "completions/mean_length": 1491.5833740234375, "completions/mean_terminated_length": 1491.5833740234375, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.8772048846675712, "frac_reward_zero_std": 0.5, "grad_norm": 0.08529601991176605, "kl": 0.0, "learning_rate": 6.262939958592132e-08, "loss": -0.0008, "num_tokens": 77252240.0, "reward": 1.2333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2586 }, { "completion_length": 1062.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2346.0, "completions/max_terminated_length": 2346.0, "completions/mean_length": 1062.3333740234375, "completions/mean_terminated_length": 1062.3333740234375, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.8775440976933514, "frac_reward_zero_std": 0.5, "grad_norm": 0.40881749987602234, "kl": 0.0, "learning_rate": 6.245686680469289e-08, "loss": 0.0057, "num_tokens": 77275146.0, "reward": 1.133333444595337, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2587 }, { "completion_length": 1563.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4218.0, "completions/max_terminated_length": 4218.0, "completions/mean_length": 1563.916748046875, "completions/mean_terminated_length": 1563.916748046875, "completions/min_length": 522.0, "completions/min_terminated_length": 522.0, "epoch": 0.8778833107191316, "frac_reward_zero_std": 0.5, "grad_norm": 0.5133988261222839, "kl": 0.0, "learning_rate": 6.228433402346445e-08, "loss": 0.0057, "num_tokens": 77308973.0, "reward": 1.1500000953674316, "reward_std": 0.17606817185878754, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444525599479675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2588 }, { "completion_length": 1290.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3653.0, "completions/max_terminated_length": 3653.0, "completions/mean_length": 1290.666748046875, "completions/mean_terminated_length": 1290.666748046875, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.8782225237449118, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 6.211180124223602e-08, "loss": 0.0, "num_tokens": 77334157.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2589 }, { "completion_length": 1326.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2433.0, "completions/max_terminated_length": 2433.0, "completions/mean_length": 1326.416748046875, "completions/mean_terminated_length": 1326.416748046875, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "epoch": 0.878561736770692, "frac_reward_zero_std": 1.0, "grad_norm": 3.038158808976732e-07, "kl": 0.0, "learning_rate": 6.193926846100758e-08, "loss": 0.0, "num_tokens": 77362626.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2590 }, { "completion_length": 1474.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3111.0, "completions/max_terminated_length": 3111.0, "completions/mean_length": 1474.0833740234375, "completions/mean_terminated_length": 1474.0833740234375, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.8789009497964722, "frac_reward_zero_std": 0.0, "grad_norm": 0.8145392537117004, "kl": 0.0, "learning_rate": 6.176673567977916e-08, "loss": 0.0029, "num_tokens": 77393443.0, "reward": 0.9666666388511658, "reward_std": 0.4779853820800781, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.4923659861087799, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2591 }, { "completion_length": 1733.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4026.0, "completions/max_terminated_length": 4026.0, "completions/mean_length": 1733.166748046875, "completions/mean_terminated_length": 1733.166748046875, "completions/min_length": 771.0, "completions/min_terminated_length": 771.0, "epoch": 0.8792401628222524, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 6.159420289855073e-08, "loss": 0.0, "num_tokens": 77427969.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2592 }, { "completion_length": 2134.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 5769.0, "completions/max_terminated_length": 5769.0, "completions/mean_length": 2134.416748046875, "completions/mean_terminated_length": 2134.416748046875, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.8795793758480326, "frac_reward_zero_std": 0.5, "grad_norm": 0.5720961689949036, "kl": 0.0, "learning_rate": 6.142167011732228e-08, "loss": 0.0111, "num_tokens": 77465024.0, "reward": 0.833333432674408, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2593 }, { "completion_length": 381.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 381.8333435058594, "completions/mean_terminated_length": 381.8333435058594, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.8799185888738128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 6.124913733609385e-08, "loss": 0.0, "num_tokens": 77481402.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2594 }, { "completion_length": 635.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1814.0, "completions/max_terminated_length": 1814.0, "completions/mean_length": 635.5, "completions/mean_terminated_length": 635.5, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.8802578018995929, "frac_reward_zero_std": 0.5, "grad_norm": 0.2813169062137604, "kl": 0.0, "learning_rate": 6.107660455486543e-08, "loss": 0.0002, "num_tokens": 77497566.0, "reward": 1.2166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2595 }, { "completion_length": 788.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1239.0, "completions/max_terminated_length": 1239.0, "completions/mean_length": 788.25, "completions/mean_terminated_length": 788.25, "completions/min_length": 591.0, "completions/min_terminated_length": 591.0, "epoch": 0.8805970149253731, "frac_reward_zero_std": 1.0, "grad_norm": 1.7701631804811768e-07, "kl": 0.0, "learning_rate": 6.090407177363699e-08, "loss": 0.0, "num_tokens": 77522415.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2596 }, { "completion_length": 1322.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3850.0, "completions/max_terminated_length": 3850.0, "completions/mean_length": 1322.75, "completions/mean_terminated_length": 1322.75, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.8809362279511533, "frac_reward_zero_std": 0.5, "grad_norm": 0.08548951148986816, "kl": 0.0, "learning_rate": 6.073153899240856e-08, "loss": -0.0006, "num_tokens": 77550726.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2597 }, { "completion_length": 855.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1784.0, "completions/max_terminated_length": 1784.0, "completions/mean_length": 855.1666870117188, "completions/mean_terminated_length": 855.1666870117188, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.8812754409769336, "frac_reward_zero_std": 1.0, "grad_norm": 7.856421291307925e-08, "kl": 0.0, "learning_rate": 6.055900621118013e-08, "loss": 0.0, "num_tokens": 77574170.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2598 }, { "completion_length": 646.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 646.1666870117188, "completions/mean_terminated_length": 646.1666870117188, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.8816146540027137, "frac_reward_zero_std": 0.5, "grad_norm": 0.060372017323970795, "kl": 0.0, "learning_rate": 6.038647342995169e-08, "loss": -0.0007, "num_tokens": 77594896.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2599 }, { "completion_length": 896.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3248.0, "completions/max_terminated_length": 3248.0, "completions/mean_length": 896.1666870117188, "completions/mean_terminated_length": 896.1666870117188, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.8819538670284939, "frac_reward_zero_std": 0.0, "grad_norm": 0.12935520708560944, "kl": 0.0, "learning_rate": 6.021394064872326e-08, "loss": 0.005, "num_tokens": 77616810.0, "reward": 1.183333396911621, "reward_std": 0.09246458858251572, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2600 }, { "completion_length": 1932.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4384.0, "completions/max_terminated_length": 4384.0, "completions/mean_length": 1932.916748046875, "completions/mean_terminated_length": 1932.916748046875, "completions/min_length": 546.0, "completions/min_terminated_length": 546.0, "epoch": 0.8822930800542741, "frac_reward_zero_std": 1.0, "grad_norm": 8.42230392095189e-08, "kl": 0.0, "learning_rate": 6.004140786749482e-08, "loss": 0.0, "num_tokens": 77649653.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2601 }, { "completion_length": 985.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2253.0, "completions/max_terminated_length": 2253.0, "completions/mean_length": 985.5833740234375, "completions/mean_terminated_length": 985.5833740234375, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.8826322930800543, "frac_reward_zero_std": 0.5, "grad_norm": 0.09111157059669495, "kl": 0.0, "learning_rate": 5.986887508626639e-08, "loss": -0.0032, "num_tokens": 77674836.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2602 }, { "completion_length": 988.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1696.0, "completions/max_terminated_length": 1696.0, "completions/mean_length": 988.3333740234375, "completions/mean_terminated_length": 988.3333740234375, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "epoch": 0.8829715061058344, "frac_reward_zero_std": 1.0, "grad_norm": 1.1221291629226471e-07, "kl": 0.0, "learning_rate": 5.969634230503796e-08, "loss": 0.0, "num_tokens": 77698642.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2603 }, { "completion_length": 1352.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4416.0, "completions/max_terminated_length": 4416.0, "completions/mean_length": 1352.5, "completions/mean_terminated_length": 1352.5, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.8833107191316146, "frac_reward_zero_std": 0.5, "grad_norm": 0.09140629321336746, "kl": 0.0, "learning_rate": 5.9523809523809515e-08, "loss": -0.0007, "num_tokens": 77727346.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2604 }, { "completion_length": 1090.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2255.0, "completions/max_terminated_length": 2255.0, "completions/mean_length": 1090.416748046875, "completions/mean_terminated_length": 1090.416748046875, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 0.8836499321573948, "frac_reward_zero_std": 1.0, "grad_norm": 1.8499743248412415e-07, "kl": 0.0, "learning_rate": 5.935127674258109e-08, "loss": 0.0, "num_tokens": 77753349.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2605 }, { "completion_length": 800.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1359.0, "completions/max_terminated_length": 1359.0, "completions/mean_length": 800.75, "completions/mean_terminated_length": 800.75, "completions/min_length": 530.0, "completions/min_terminated_length": 530.0, "epoch": 0.8839891451831751, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 5.9178743961352654e-08, "loss": 0.0, "num_tokens": 77775804.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2606 }, { "completion_length": 643.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1067.0, "completions/max_terminated_length": 1067.0, "completions/mean_length": 643.6666870117188, "completions/mean_terminated_length": 643.6666870117188, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.8843283582089553, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 5.900621118012422e-08, "loss": 0.0, "num_tokens": 77800454.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2607 }, { "completion_length": 884.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1746.0, "completions/max_terminated_length": 1746.0, "completions/mean_length": 884.4166870117188, "completions/mean_terminated_length": 884.4166870117188, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 0.8846675712347354, "frac_reward_zero_std": 0.5, "grad_norm": 0.1081530749797821, "kl": 0.0, "learning_rate": 5.8833678398895786e-08, "loss": 0.0023, "num_tokens": 77823685.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2608 }, { "completion_length": 2016.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4460.0, "completions/max_terminated_length": 4460.0, "completions/mean_length": 2016.5833740234375, "completions/mean_terminated_length": 2016.5833740234375, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.8850067842605156, "frac_reward_zero_std": 0.5, "grad_norm": 0.5063836574554443, "kl": 0.0, "learning_rate": 5.866114561766736e-08, "loss": -0.0013, "num_tokens": 77851718.0, "reward": 1.2000000476837158, "reward_std": 0.19999998807907104, "rewards/correctness_reward_func/mean": 0.9000000357627869, "rewards/correctness_reward_func/std": 0.28919950127601624, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2609 }, { "completion_length": 971.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 3913.0, "completions/max_terminated_length": 3913.0, "completions/mean_length": 971.5, "completions/mean_terminated_length": 971.5, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.8853459972862958, "frac_reward_zero_std": 0.5, "grad_norm": 0.5498670339584351, "kl": 0.0, "learning_rate": 5.848861283643892e-08, "loss": -0.021, "num_tokens": 77870402.0, "reward": 0.46666666865348816, "reward_std": 0.2581988573074341, "rewards/correctness_reward_func/mean": 0.1666666716337204, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2610 }, { "completion_length": 1295.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2441.0, "completions/max_terminated_length": 2441.0, "completions/mean_length": 1295.75, "completions/mean_terminated_length": 1295.75, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 0.885685210312076, "frac_reward_zero_std": 1.0, "grad_norm": 2.0806385236937786e-07, "kl": 0.0, "learning_rate": 5.8316080055210484e-08, "loss": 0.0, "num_tokens": 77901197.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2611 }, { "completion_length": 2521.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6079.0, "completions/max_terminated_length": 6079.0, "completions/mean_length": 2521.666748046875, "completions/mean_terminated_length": 2521.666748046875, "completions/min_length": 1095.0, "completions/min_terminated_length": 1095.0, "epoch": 0.8860244233378561, "frac_reward_zero_std": 0.0, "grad_norm": 0.5634554624557495, "kl": 0.0, "learning_rate": 5.814354727398205e-08, "loss": -0.037, "num_tokens": 77942479.0, "reward": 0.783333420753479, "reward_std": 0.38262733817100525, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.43029239773750305, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2612 }, { "completion_length": 779.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1799.0, "completions/max_terminated_length": 1799.0, "completions/mean_length": 779.5833740234375, "completions/mean_terminated_length": 779.5833740234375, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 0.8863636363636364, "frac_reward_zero_std": 0.0, "grad_norm": 0.09818348288536072, "kl": 0.0, "learning_rate": 5.797101449275362e-08, "loss": -0.0017, "num_tokens": 77961728.0, "reward": 1.1666667461395264, "reward_std": 0.09559707343578339, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2613 }, { "completion_length": 1173.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2220.0, "completions/max_terminated_length": 2220.0, "completions/mean_length": 1173.666748046875, "completions/mean_terminated_length": 1173.666748046875, "completions/min_length": 640.0, "completions/min_terminated_length": 640.0, "epoch": 0.8867028493894166, "frac_reward_zero_std": 1.0, "grad_norm": 1.4795979552673089e-07, "kl": 0.0, "learning_rate": 5.779848171152519e-08, "loss": 0.0, "num_tokens": 77991040.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2614 }, { "completion_length": 1476.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2562.0, "completions/max_terminated_length": 2562.0, "completions/mean_length": 1476.416748046875, "completions/mean_terminated_length": 1476.416748046875, "completions/min_length": 911.0, "completions/min_terminated_length": 911.0, "epoch": 0.8870420624151968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 5.7625948930296755e-08, "loss": 0.0, "num_tokens": 78018327.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2615 }, { "completion_length": 1600.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3597.0, "completions/max_terminated_length": 3597.0, "completions/mean_length": 1600.0, "completions/mean_terminated_length": 1600.0, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "epoch": 0.8873812754409769, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 5.7453416149068315e-08, "loss": 0.0, "num_tokens": 78048801.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2616 }, { "completion_length": 1206.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2540.0, "completions/max_terminated_length": 2540.0, "completions/mean_length": 1206.5833740234375, "completions/mean_terminated_length": 1206.5833740234375, "completions/min_length": 721.0, "completions/min_terminated_length": 721.0, "epoch": 0.8877204884667571, "frac_reward_zero_std": 0.0, "grad_norm": 0.13578635454177856, "kl": 0.0, "learning_rate": 5.728088336783989e-08, "loss": 0.0002, "num_tokens": 78077824.0, "reward": 1.2000000476837158, "reward_std": 0.10327951610088348, "rewards/correctness_reward_func/mean": 0.9000000357627869, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2617 }, { "completion_length": 1687.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4813.0, "completions/max_terminated_length": 4813.0, "completions/mean_length": 1687.166748046875, "completions/mean_terminated_length": 1687.166748046875, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.8880597014925373, "frac_reward_zero_std": 0.0, "grad_norm": 0.6502511501312256, "kl": 0.0, "learning_rate": 5.710835058661145e-08, "loss": 0.0252, "num_tokens": 78114684.0, "reward": 1.183333396911621, "reward_std": 0.2557638883590698, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2618 }, { "completion_length": 1410.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2422.0, "completions/max_terminated_length": 2422.0, "completions/mean_length": 1410.166748046875, "completions/mean_terminated_length": 1410.166748046875, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.8883989145183175, "frac_reward_zero_std": 1.0, "grad_norm": 1.8545988211826625e-07, "kl": 0.0, "learning_rate": 5.693581780538302e-08, "loss": 0.0, "num_tokens": 78145016.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2619 }, { "completion_length": 1550.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3525.0, "completions/max_terminated_length": 3525.0, "completions/mean_length": 1550.0, "completions/mean_terminated_length": 1550.0, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.8887381275440976, "frac_reward_zero_std": 1.0, "grad_norm": 1.372263369603388e-07, "kl": 0.0, "learning_rate": 5.676328502415459e-08, "loss": 0.0, "num_tokens": 78178064.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2620 }, { "completion_length": 1485.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2783.0, "completions/max_terminated_length": 2783.0, "completions/mean_length": 1485.75, "completions/mean_terminated_length": 1485.75, "completions/min_length": 759.0, "completions/min_terminated_length": 759.0, "epoch": 0.8890773405698779, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 5.659075224292615e-08, "loss": 0.0, "num_tokens": 78205391.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2621 }, { "completion_length": 631.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 631.75, "completions/mean_terminated_length": 631.75, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.8894165535956581, "frac_reward_zero_std": 1.0, "grad_norm": 1.333366981270956e-07, "kl": 0.0, "learning_rate": 5.641821946169772e-08, "loss": 0.0, "num_tokens": 78223184.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2622 }, { "completion_length": 564.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 927.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 564.8333740234375, "completions/mean_terminated_length": 564.8333740234375, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.8897557666214383, "frac_reward_zero_std": 0.5, "grad_norm": 0.06118715554475784, "kl": 0.0, "learning_rate": 5.6245686680469284e-08, "loss": -0.0003, "num_tokens": 78243006.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2623 }, { "completion_length": 822.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2280.0, "completions/max_terminated_length": 2280.0, "completions/mean_length": 822.4166870117188, "completions/mean_terminated_length": 822.4166870117188, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.8900949796472184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 5.6073153899240856e-08, "loss": 0.0, "num_tokens": 78265751.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2624 }, { "completion_length": 2290.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5855.0, "completions/max_terminated_length": 5855.0, "completions/mean_length": 2290.5, "completions/mean_terminated_length": 2290.5, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.8904341926729986, "frac_reward_zero_std": 0.5, "grad_norm": 0.9807217121124268, "kl": 0.0, "learning_rate": 5.590062111801242e-08, "loss": 0.0293, "num_tokens": 78302675.0, "reward": 0.9500000476837158, "reward_std": 0.27386125922203064, "rewards/correctness_reward_func/mean": 0.6499999761581421, "rewards/correctness_reward_func/std": 0.40113475918769836, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2625 }, { "completion_length": 594.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 594.0, "completions/mean_terminated_length": 594.0, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 0.8907734056987788, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 5.572808833678399e-08, "loss": 0.0, "num_tokens": 78319637.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2626 }, { "completion_length": 1932.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5957.0, "completions/max_terminated_length": 5957.0, "completions/mean_length": 1932.5, "completions/mean_terminated_length": 1932.5, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 0.891112618724559, "frac_reward_zero_std": 0.5, "grad_norm": 0.4628717303276062, "kl": 0.0, "learning_rate": 5.555555555555555e-08, "loss": 0.001, "num_tokens": 78356141.0, "reward": 0.5499999523162842, "reward_std": 0.27386125922203064, "rewards/correctness_reward_func/mean": 0.25, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2627 }, { "completion_length": 1744.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3541.0, "completions/max_terminated_length": 3541.0, "completions/mean_length": 1744.916748046875, "completions/mean_terminated_length": 1744.916748046875, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 0.8914518317503393, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 5.538302277432712e-08, "loss": 0.0, "num_tokens": 78390310.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2628 }, { "completion_length": 429.25, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 429.25, "completions/mean_terminated_length": 429.25, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.8917910447761194, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 5.5210489993098687e-08, "loss": 0.0, "num_tokens": 78408553.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2629 }, { "completion_length": 742.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1059.0, "completions/max_terminated_length": 1059.0, "completions/mean_length": 742.1666870117188, "completions/mean_terminated_length": 742.1666870117188, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "epoch": 0.8921302578018996, "frac_reward_zero_std": 0.5, "grad_norm": 0.0724111944437027, "kl": 0.0, "learning_rate": 5.503795721187025e-08, "loss": -0.0004, "num_tokens": 78432351.0, "reward": 1.2333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2630 }, { "completion_length": 869.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1970.0, "completions/max_terminated_length": 1970.0, "completions/mean_length": 869.75, "completions/mean_terminated_length": 869.75, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.8924694708276798, "frac_reward_zero_std": 1.0, "grad_norm": 1.449867710334729e-07, "kl": 0.0, "learning_rate": 5.486542443064182e-08, "loss": 0.0, "num_tokens": 78455400.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2631 }, { "completion_length": 1542.5833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3615.0, "completions/mean_length": 2091.666748046875, "completions/mean_terminated_length": 1682.8182373046875, "completions/min_length": 591.0, "completions/min_terminated_length": 591.0, "epoch": 0.89280868385346, "frac_reward_zero_std": 0.5, "grad_norm": 0.3900226652622223, "kl": NaN, "learning_rate": 5.469289164941339e-08, "loss": -0.0367, "num_tokens": 78488359.0, "reward": 1.0916666984558105, "reward_std": 0.26536136865615845, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 2632 }, { "completion_length": 2165.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4595.0, "completions/max_terminated_length": 4595.0, "completions/mean_length": 2165.33349609375, "completions/mean_terminated_length": 2165.33349609375, "completions/min_length": 583.0, "completions/min_terminated_length": 583.0, "epoch": 0.8931478968792401, "frac_reward_zero_std": 0.5, "grad_norm": 0.9237000346183777, "kl": 0.0, "learning_rate": 5.452035886818495e-08, "loss": 0.015, "num_tokens": 78526205.0, "reward": 1.133333444595337, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2633 }, { "completion_length": 712.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1134.0, "completions/max_terminated_length": 1134.0, "completions/mean_length": 712.0, "completions/mean_terminated_length": 712.0, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 0.8934871099050203, "frac_reward_zero_std": 0.0, "grad_norm": 0.08076629787683487, "kl": 0.0, "learning_rate": 5.434782608695652e-08, "loss": 0.0008, "num_tokens": 78543497.0, "reward": 1.2166666984558105, "reward_std": 0.09246456623077393, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2634 }, { "completion_length": 706.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1162.0, "completions/max_terminated_length": 1162.0, "completions/mean_length": 706.5, "completions/mean_terminated_length": 706.5, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 0.8938263229308006, "frac_reward_zero_std": 0.5, "grad_norm": 0.3047957122325897, "kl": 0.0, "learning_rate": 5.417529330572808e-08, "loss": 0.0022, "num_tokens": 78566291.0, "reward": 0.8666666746139526, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2635 }, { "completion_length": 507.00001525878906, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 507.0, "completions/mean_terminated_length": 507.0, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.8941655359565808, "frac_reward_zero_std": 1.0, "grad_norm": 9.127612088377646e-08, "kl": 0.0, "learning_rate": 5.4002760524499656e-08, "loss": 0.0, "num_tokens": 78582875.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2636 }, { "completion_length": 1071.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 3252.0, "completions/max_terminated_length": 3252.0, "completions/mean_length": 1071.8333740234375, "completions/mean_terminated_length": 1071.8333740234375, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.8945047489823609, "frac_reward_zero_std": 0.0, "grad_norm": 0.4156602919101715, "kl": 0.0, "learning_rate": 5.383022774327122e-08, "loss": -0.012, "num_tokens": 78606981.0, "reward": 0.9666666984558105, "reward_std": 0.27072879672050476, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2637 }, { "completion_length": 780.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1549.0, "completions/max_terminated_length": 1549.0, "completions/mean_length": 780.6666870117188, "completions/mean_terminated_length": 780.6666870117188, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.8948439620081411, "frac_reward_zero_std": 0.5, "grad_norm": 0.07908864319324493, "kl": 0.0, "learning_rate": 5.365769496204279e-08, "loss": 0.0005, "num_tokens": 78623339.0, "reward": 1.2333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2638 }, { "completion_length": 553.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 553.8333740234375, "completions/mean_terminated_length": 553.8333740234375, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.8951831750339213, "frac_reward_zero_std": 1.0, "grad_norm": 9.465839667655018e-08, "kl": 0.0, "learning_rate": 5.348516218081435e-08, "loss": 0.0, "num_tokens": 78644625.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2639 }, { "completion_length": 492.75, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 492.75, "completions/mean_terminated_length": 492.75, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.8955223880597015, "frac_reward_zero_std": 0.5, "grad_norm": 0.04776453971862793, "kl": 0.0, "learning_rate": 5.331262939958592e-08, "loss": -0.0001, "num_tokens": 78663300.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2640 }, { "completion_length": 2390.2500610351562, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5858.0, "completions/mean_length": 2939.33349609375, "completions/mean_terminated_length": 2607.545654296875, "completions/min_length": 616.0, "completions/min_terminated_length": 616.0, "epoch": 0.8958616010854816, "frac_reward_zero_std": 0.0, "grad_norm": 0.9246450662612915, "kl": NaN, "learning_rate": 5.3140096618357486e-08, "loss": -0.03, "num_tokens": 78701967.0, "reward": 0.949999988079071, "reward_std": 0.33648794889450073, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.43064433336257935, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.09770084172487259, "step": 2641 }, { "completion_length": 729.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1284.0, "completions/max_terminated_length": 1284.0, "completions/mean_length": 729.75, "completions/mean_terminated_length": 729.75, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.8962008141112618, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 5.296756383712905e-08, "loss": 0.0, "num_tokens": 78725670.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2642 }, { "completion_length": 569.0000152587891, "completions/clipped_ratio": 0.0, "completions/max_length": 1042.0, "completions/max_terminated_length": 1042.0, "completions/mean_length": 569.0, "completions/mean_terminated_length": 569.0, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.8965400271370421, "frac_reward_zero_std": 1.0, "grad_norm": 1.4994584773830866e-07, "kl": 0.0, "learning_rate": 5.279503105590062e-08, "loss": 0.0, "num_tokens": 78744378.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2643 }, { "completion_length": 1238.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2678.0, "completions/max_terminated_length": 2678.0, "completions/mean_length": 1238.916748046875, "completions/mean_terminated_length": 1238.916748046875, "completions/min_length": 655.0, "completions/min_terminated_length": 655.0, "epoch": 0.8968792401628223, "frac_reward_zero_std": 0.5, "grad_norm": 0.06167903542518616, "kl": 0.0, "learning_rate": 5.2622498274672184e-08, "loss": -0.0002, "num_tokens": 78770801.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2644 }, { "completion_length": 1267.6666870117188, "completions/clipped_ratio": 0.5, "completions/max_length": 6589.0, "completions/max_terminated_length": 3442.0, "completions/mean_length": 4562.1669921875, "completions/mean_terminated_length": 2535.33349609375, "completions/min_length": 1446.0, "completions/min_terminated_length": 1446.0, "epoch": 0.8972184531886025, "frac_reward_zero_std": 0.0, "grad_norm": 0.16181054711341858, "kl": NaN, "learning_rate": 5.244996549344375e-08, "loss": -0.0156, "num_tokens": 78799843.0, "reward": 0.16249999403953552, "reward_std": 0.12398675084114075, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.16250000894069672, "rewards/format_reward_func/std": 0.14943073689937592, "step": 2645 }, { "completion_length": 1021.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2133.0, "completions/max_terminated_length": 2133.0, "completions/mean_length": 1021.25, "completions/mean_terminated_length": 1021.25, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.8975576662143826, "frac_reward_zero_std": 0.0, "grad_norm": 0.4140142798423767, "kl": 0.0, "learning_rate": 5.2277432712215316e-08, "loss": 0.0046, "num_tokens": 78821362.0, "reward": 1.1166666746139526, "reward_std": 0.29902368783950806, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.3857302963733673, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2646 }, { "completion_length": 697.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 954.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 697.25, "completions/mean_terminated_length": 697.25, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.8978968792401628, "frac_reward_zero_std": 0.5, "grad_norm": 0.08367545157670975, "kl": 0.0, "learning_rate": 5.210489993098688e-08, "loss": 0.0003, "num_tokens": 78843433.0, "reward": 0.7333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2647 }, { "completion_length": 671.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 671.75, "completions/mean_terminated_length": 671.75, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.898236092265943, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 5.1932367149758455e-08, "loss": 0.0, "num_tokens": 78866290.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2648 }, { "completion_length": 1995.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4270.0, "completions/max_terminated_length": 4270.0, "completions/mean_length": 1995.8333740234375, "completions/mean_terminated_length": 1995.8333740234375, "completions/min_length": 587.0, "completions/min_terminated_length": 587.0, "epoch": 0.8985753052917232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 5.175983436853002e-08, "loss": 0.0, "num_tokens": 78903884.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2649 }, { "completion_length": 1692.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5450.0, "completions/max_terminated_length": 5450.0, "completions/mean_length": 1692.0833740234375, "completions/mean_terminated_length": 1692.0833740234375, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.8989145183175034, "frac_reward_zero_std": 1.0, "grad_norm": 1.3402505771864526e-07, "kl": 0.0, "learning_rate": 5.158730158730158e-08, "loss": 0.0, "num_tokens": 78936387.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2650 }, { "completion_length": 874.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1299.0, "completions/max_terminated_length": 1299.0, "completions/mean_length": 874.0, "completions/mean_terminated_length": 874.0, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 0.8992537313432836, "frac_reward_zero_std": 0.5, "grad_norm": 0.09666566550731659, "kl": 0.0, "learning_rate": 5.141476880607315e-08, "loss": 0.001, "num_tokens": 78958209.0, "reward": 1.2333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2651 }, { "completion_length": 1136.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2302.0, "completions/max_terminated_length": 2302.0, "completions/mean_length": 1136.916748046875, "completions/mean_terminated_length": 1136.916748046875, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.8995929443690638, "frac_reward_zero_std": 1.0, "grad_norm": 1.2993388054383104e-07, "kl": 0.0, "learning_rate": 5.124223602484472e-08, "loss": 0.0, "num_tokens": 78983576.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2652 }, { "completion_length": 847.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1757.0, "completions/max_terminated_length": 1757.0, "completions/mean_length": 847.3333740234375, "completions/mean_terminated_length": 847.3333740234375, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.899932157394844, "frac_reward_zero_std": 0.5, "grad_norm": 0.09100963175296783, "kl": 0.0, "learning_rate": 5.1069703243616285e-08, "loss": -0.0005, "num_tokens": 79009476.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2653 }, { "completion_length": 894.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1268.0, "completions/max_terminated_length": 1268.0, "completions/mean_length": 894.6666870117188, "completions/mean_terminated_length": 894.6666870117188, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "epoch": 0.9002713704206241, "frac_reward_zero_std": 0.0, "grad_norm": 0.34952783584594727, "kl": 0.0, "learning_rate": 5.089717046238785e-08, "loss": -0.0017, "num_tokens": 79032914.0, "reward": 1.066666841506958, "reward_std": 0.21493908762931824, "rewards/correctness_reward_func/mean": 0.7666667103767395, "rewards/correctness_reward_func/std": 0.2534608840942383, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2654 }, { "completion_length": 1080.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2221.0, "completions/max_terminated_length": 2221.0, "completions/mean_length": 1080.0, "completions/mean_terminated_length": 1080.0, "completions/min_length": 577.0, "completions/min_terminated_length": 577.0, "epoch": 0.9006105834464043, "frac_reward_zero_std": 0.5, "grad_norm": 0.551978588104248, "kl": 0.0, "learning_rate": 5.0724637681159424e-08, "loss": 0.0117, "num_tokens": 79054736.0, "reward": 1.1166667938232422, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2655 }, { "completion_length": 819.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1719.0, "completions/max_terminated_length": 1719.0, "completions/mean_length": 819.6666870117188, "completions/mean_terminated_length": 819.6666870117188, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 0.9009497964721845, "frac_reward_zero_std": 0.5, "grad_norm": 0.45870906114578247, "kl": 0.0, "learning_rate": 5.0552104899930984e-08, "loss": 0.0093, "num_tokens": 79074070.0, "reward": 1.0, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.43064433336257935, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2656 }, { "completion_length": 977.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1600.0, "completions/max_terminated_length": 1600.0, "completions/mean_length": 977.5833740234375, "completions/mean_terminated_length": 977.5833740234375, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.9012890094979648, "frac_reward_zero_std": 1.0, "grad_norm": 1.584904367746276e-07, "kl": 0.0, "learning_rate": 5.037957211870255e-08, "loss": 0.0, "num_tokens": 79099745.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2657 }, { "completion_length": 611.1666717529297, "completions/clipped_ratio": 0.0, "completions/max_length": 1229.0, "completions/max_terminated_length": 1229.0, "completions/mean_length": 611.1666870117188, "completions/mean_terminated_length": 611.1666870117188, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.9016282225237449, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 5.0207039337474116e-08, "loss": 0.0, "num_tokens": 79117453.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2658 }, { "completion_length": 727.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1514.0, "completions/max_terminated_length": 1514.0, "completions/mean_length": 727.5833740234375, "completions/mean_terminated_length": 727.5833740234375, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.9019674355495251, "frac_reward_zero_std": 0.0, "grad_norm": 0.4461038410663605, "kl": 0.0, "learning_rate": 5.003450655624569e-08, "loss": 0.0058, "num_tokens": 79140650.0, "reward": 1.133333444595337, "reward_std": 0.25163978338241577, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.28069180250167847, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2659 }, { "completion_length": 1174.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2410.0, "completions/max_terminated_length": 2410.0, "completions/mean_length": 1174.5, "completions/mean_terminated_length": 1174.5, "completions/min_length": 570.0, "completions/min_terminated_length": 570.0, "epoch": 0.9023066485753053, "frac_reward_zero_std": 1.0, "grad_norm": 2.01942057742599e-07, "kl": 0.0, "learning_rate": 4.9861973775017255e-08, "loss": 0.0, "num_tokens": 79169564.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2660 }, { "completion_length": 562.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 943.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 562.5, "completions/mean_terminated_length": 562.5, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.9026458616010855, "frac_reward_zero_std": 1.0, "grad_norm": 1.1696494794932732e-07, "kl": 0.0, "learning_rate": 4.9689440993788814e-08, "loss": 0.0, "num_tokens": 79190162.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2661 }, { "completion_length": 738.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1360.0, "completions/max_terminated_length": 1360.0, "completions/mean_length": 738.25, "completions/mean_terminated_length": 738.25, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 0.9029850746268657, "frac_reward_zero_std": 0.5, "grad_norm": 0.40984442830085754, "kl": 0.0, "learning_rate": 4.951690821256038e-08, "loss": 0.0011, "num_tokens": 79209743.0, "reward": 0.8666667342185974, "reward_std": 0.2581988573074341, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.42497774958610535, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2662 }, { "completion_length": 657.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1118.0, "completions/max_terminated_length": 1118.0, "completions/mean_length": 657.1666870117188, "completions/mean_terminated_length": 657.1666870117188, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.9033242876526458, "frac_reward_zero_std": 0.5, "grad_norm": 0.07391693443059921, "kl": 0.0, "learning_rate": 4.934437543133195e-08, "loss": -0.0001, "num_tokens": 79230895.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2663 }, { "completion_length": 1294.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3784.0, "completions/max_terminated_length": 3784.0, "completions/mean_length": 1294.25, "completions/mean_terminated_length": 1294.25, "completions/min_length": 585.0, "completions/min_terminated_length": 585.0, "epoch": 0.903663500678426, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.917184265010352e-08, "loss": 0.0, "num_tokens": 79259932.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2664 }, { "completion_length": 495.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 712.0, "completions/max_terminated_length": 712.0, "completions/mean_length": 495.8333435058594, "completions/mean_terminated_length": 495.8333435058594, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.9040027137042063, "frac_reward_zero_std": 1.0, "grad_norm": 7.691157577482954e-08, "kl": 0.0, "learning_rate": 4.8999309868875085e-08, "loss": 0.0, "num_tokens": 79277612.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2665 }, { "completion_length": 959.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2611.0, "completions/max_terminated_length": 2611.0, "completions/mean_length": 959.5833740234375, "completions/mean_terminated_length": 959.5833740234375, "completions/min_length": 507.0, "completions/min_terminated_length": 507.0, "epoch": 0.9043419267299865, "frac_reward_zero_std": 1.0, "grad_norm": 1.204231665496991e-07, "kl": 0.0, "learning_rate": 4.882677708764665e-08, "loss": 0.0, "num_tokens": 79304859.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2666 }, { "completion_length": 847.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1870.0, "completions/max_terminated_length": 1870.0, "completions/mean_length": 847.8333740234375, "completions/mean_terminated_length": 847.8333740234375, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 0.9046811397557666, "frac_reward_zero_std": 0.5, "grad_norm": 0.3793267011642456, "kl": 0.0, "learning_rate": 4.865424430641822e-08, "loss": 0.0005, "num_tokens": 79326223.0, "reward": 1.1166667938232422, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2667 }, { "completion_length": 738.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1130.0, "completions/max_terminated_length": 1130.0, "completions/mean_length": 738.8333740234375, "completions/mean_terminated_length": 738.8333740234375, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.9050203527815468, "frac_reward_zero_std": 1.0, "grad_norm": 9.54229832927922e-08, "kl": 0.0, "learning_rate": 4.848171152518978e-08, "loss": 0.0, "num_tokens": 79341119.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2668 }, { "completion_length": 1402.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4277.0, "completions/max_terminated_length": 4277.0, "completions/mean_length": 1402.0833740234375, "completions/mean_terminated_length": 1402.0833740234375, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.905359565807327, "frac_reward_zero_std": 0.5, "grad_norm": 0.06256908923387527, "kl": 0.0, "learning_rate": 4.830917874396135e-08, "loss": -0.0007, "num_tokens": 79370496.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2669 }, { "completion_length": 2518.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6025.0, "completions/max_terminated_length": 6025.0, "completions/mean_length": 2518.666748046875, "completions/mean_terminated_length": 2518.666748046875, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 0.9056987788331072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.8136645962732915e-08, "loss": 0.0, "num_tokens": 79408418.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2670 }, { "completion_length": 637.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 935.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 637.1666870117188, "completions/mean_terminated_length": 637.1666870117188, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.9060379918588873, "frac_reward_zero_std": 1.0, "grad_norm": 1.4777671708543494e-07, "kl": 0.0, "learning_rate": 4.796411318150449e-08, "loss": 0.0, "num_tokens": 79430536.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2671 }, { "completion_length": 700.1666870117188, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 1306.0, "completions/mean_length": 1249.25, "completions/mean_terminated_length": 763.8181762695312, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.9063772048846676, "frac_reward_zero_std": 0.5, "grad_norm": 0.08373986929655075, "kl": NaN, "learning_rate": 4.7791580400276054e-08, "loss": -0.0032, "num_tokens": 79451118.0, "reward": 0.27500003576278687, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 2672 }, { "completion_length": 850.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1250.0, "completions/max_terminated_length": 1250.0, "completions/mean_length": 850.75, "completions/mean_terminated_length": 850.75, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "epoch": 0.9067164179104478, "frac_reward_zero_std": 1.0, "grad_norm": 1.8840724180790858e-07, "kl": 0.0, "learning_rate": 4.7619047619047613e-08, "loss": 0.0, "num_tokens": 79473045.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2673 }, { "completion_length": 718.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1164.0, "completions/max_terminated_length": 1164.0, "completions/mean_length": 718.4166870117188, "completions/mean_terminated_length": 718.4166870117188, "completions/min_length": 577.0, "completions/min_terminated_length": 577.0, "epoch": 0.907055630936228, "frac_reward_zero_std": 0.5, "grad_norm": 0.07943646609783173, "kl": 0.0, "learning_rate": 4.744651483781918e-08, "loss": 0.0001, "num_tokens": 79495496.0, "reward": 1.2333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2674 }, { "completion_length": 742.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1397.0, "completions/max_terminated_length": 1397.0, "completions/mean_length": 742.5, "completions/mean_terminated_length": 742.5, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 0.9073948439620081, "frac_reward_zero_std": 0.0, "grad_norm": 0.3805397152900696, "kl": 0.0, "learning_rate": 4.727398205659075e-08, "loss": -0.0004, "num_tokens": 79515800.0, "reward": 1.183333396911621, "reward_std": 0.24082480370998383, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2675 }, { "completion_length": 1510.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3741.0, "completions/max_terminated_length": 3741.0, "completions/mean_length": 1510.916748046875, "completions/mean_terminated_length": 1510.916748046875, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.9077340569877883, "frac_reward_zero_std": 0.5, "grad_norm": 0.81682950258255, "kl": 0.0, "learning_rate": 4.710144927536232e-08, "loss": 0.0285, "num_tokens": 79547749.0, "reward": 1.0499999523162842, "reward_std": 0.27386125922203064, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2676 }, { "completion_length": 789.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1307.0, "completions/max_terminated_length": 1307.0, "completions/mean_length": 789.25, "completions/mean_terminated_length": 789.25, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 0.9080732700135685, "frac_reward_zero_std": 1.0, "grad_norm": 8.661001515974931e-08, "kl": 0.0, "learning_rate": 4.6928916494133884e-08, "loss": 0.0, "num_tokens": 79564792.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2677 }, { "completion_length": 603.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 603.5833740234375, "completions/mean_terminated_length": 603.5833740234375, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 0.9084124830393487, "frac_reward_zero_std": 0.5, "grad_norm": 0.06758517026901245, "kl": 0.0, "learning_rate": 4.675638371290545e-08, "loss": 0.0006, "num_tokens": 79582703.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2678 }, { "completion_length": 985.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 3148.0, "completions/max_terminated_length": 3148.0, "completions/mean_length": 985.0, "completions/mean_terminated_length": 985.0, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.908751696065129, "frac_reward_zero_std": 1.0, "grad_norm": 1.2997078613352642e-07, "kl": 0.0, "learning_rate": 4.6583850931677016e-08, "loss": 0.0, "num_tokens": 79607981.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2679 }, { "completion_length": 1385.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4029.0, "completions/max_terminated_length": 4029.0, "completions/mean_length": 1385.916748046875, "completions/mean_terminated_length": 1385.916748046875, "completions/min_length": 629.0, "completions/min_terminated_length": 629.0, "epoch": 0.9090909090909091, "frac_reward_zero_std": 0.5, "grad_norm": 0.3527598977088928, "kl": 0.0, "learning_rate": 4.641131815044858e-08, "loss": 0.0055, "num_tokens": 79639096.0, "reward": 1.1500000953674316, "reward_std": 0.17606817185878754, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444525599479675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2680 }, { "completion_length": 1330.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4747.0, "completions/max_terminated_length": 4747.0, "completions/mean_length": 1330.0833740234375, "completions/mean_terminated_length": 1330.0833740234375, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 0.9094301221166893, "frac_reward_zero_std": 0.0, "grad_norm": 0.5836919546127319, "kl": 0.0, "learning_rate": 4.623878536922015e-08, "loss": -0.0178, "num_tokens": 79667855.0, "reward": 0.9833333492279053, "reward_std": 0.3129711151123047, "rewards/correctness_reward_func/mean": 0.6833333373069763, "rewards/correctness_reward_func/std": 0.4217568039894104, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2681 }, { "completion_length": 1216.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2280.0, "completions/max_terminated_length": 2280.0, "completions/mean_length": 1216.0, "completions/mean_terminated_length": 1216.0, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.9097693351424695, "frac_reward_zero_std": 1.0, "grad_norm": 1.0221882718042252e-07, "kl": 0.0, "learning_rate": 4.6066252587991715e-08, "loss": 0.0, "num_tokens": 79696175.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2682 }, { "completion_length": 793.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1166.0, "completions/max_terminated_length": 1166.0, "completions/mean_length": 793.9166870117188, "completions/mean_terminated_length": 793.9166870117188, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 0.9101085481682497, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.589371980676329e-08, "loss": 0.0, "num_tokens": 79718248.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2683 }, { "completion_length": 1111.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2933.0, "completions/max_terminated_length": 2933.0, "completions/mean_length": 1111.916748046875, "completions/mean_terminated_length": 1111.916748046875, "completions/min_length": 577.0, "completions/min_terminated_length": 577.0, "epoch": 0.9104477611940298, "frac_reward_zero_std": 0.5, "grad_norm": 0.5338830947875977, "kl": 0.0, "learning_rate": 4.572118702553485e-08, "loss": 0.0077, "num_tokens": 79744341.0, "reward": 1.133333444595337, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2684 }, { "completion_length": 2008.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3539.0, "completions/max_terminated_length": 3539.0, "completions/mean_length": 2008.5, "completions/mean_terminated_length": 2008.5, "completions/min_length": 1026.0, "completions/min_terminated_length": 1026.0, "epoch": 0.91078697421981, "frac_reward_zero_std": 0.5, "grad_norm": 0.13771429657936096, "kl": 0.0, "learning_rate": 4.554865424430641e-08, "loss": 0.0008, "num_tokens": 79780395.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2685 }, { "completion_length": 517.0000152587891, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 517.0, "completions/mean_terminated_length": 517.0, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.9111261872455902, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.537612146307798e-08, "loss": 0.0, "num_tokens": 79798179.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2686 }, { "completion_length": 933.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1073.0, "completions/max_terminated_length": 1073.0, "completions/mean_length": 933.0, "completions/mean_terminated_length": 933.0, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "epoch": 0.9114654002713705, "frac_reward_zero_std": 1.0, "grad_norm": 1.4549084426107584e-07, "kl": 0.0, "learning_rate": 4.520358868184955e-08, "loss": 0.0, "num_tokens": 79819581.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2687 }, { "completion_length": 801.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1445.0, "completions/max_terminated_length": 1445.0, "completions/mean_length": 801.3333740234375, "completions/mean_terminated_length": 801.3333740234375, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.9118046132971506, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.503105590062112e-08, "loss": 0.0, "num_tokens": 79837603.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2688 }, { "completion_length": 578.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 578.75, "completions/mean_terminated_length": 578.75, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.9121438263229308, "frac_reward_zero_std": 0.5, "grad_norm": 0.06709299236536026, "kl": 0.0, "learning_rate": 4.4858523119392684e-08, "loss": 0.0, "num_tokens": 79855918.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2689 }, { "completion_length": 1781.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2838.0, "completions/max_terminated_length": 2838.0, "completions/mean_length": 1781.3333740234375, "completions/mean_terminated_length": 1781.3333740234375, "completions/min_length": 997.0, "completions/min_terminated_length": 997.0, "epoch": 0.912483039348711, "frac_reward_zero_std": 0.5, "grad_norm": 0.0879731997847557, "kl": 0.0, "learning_rate": 4.468599033816424e-08, "loss": 0.0022, "num_tokens": 79888298.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2690 }, { "completion_length": 881.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1644.0, "completions/max_terminated_length": 1644.0, "completions/mean_length": 881.5833740234375, "completions/mean_terminated_length": 881.5833740234375, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 0.9128222523744912, "frac_reward_zero_std": 0.5, "grad_norm": 0.10760144144296646, "kl": 0.0, "learning_rate": 4.4513457556935816e-08, "loss": 0.0027, "num_tokens": 79913487.0, "reward": 1.25, "reward_std": 0.05477222427725792, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2691 }, { "completion_length": 1875.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4261.0, "completions/max_terminated_length": 4261.0, "completions/mean_length": 1875.416748046875, "completions/mean_terminated_length": 1875.416748046875, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 0.9131614654002713, "frac_reward_zero_std": 0.5, "grad_norm": 0.35787227749824524, "kl": 0.0, "learning_rate": 4.434092477570738e-08, "loss": 0.007, "num_tokens": 79947506.0, "reward": 1.183333396911621, "reward_std": 0.19407901167869568, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2692 }, { "completion_length": 668.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1214.0, "completions/max_terminated_length": 1214.0, "completions/mean_length": 668.6666870117188, "completions/mean_terminated_length": 668.6666870117188, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.9135006784260515, "frac_reward_zero_std": 0.5, "grad_norm": 0.06021340563893318, "kl": 0.0, "learning_rate": 4.416839199447895e-08, "loss": -0.0001, "num_tokens": 79969288.0, "reward": 1.2625000476837158, "reward_std": 0.04107918590307236, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.06784005463123322, "step": 2693 }, { "completion_length": 1166.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2208.0, "completions/max_terminated_length": 2208.0, "completions/mean_length": 1166.166748046875, "completions/mean_terminated_length": 1166.166748046875, "completions/min_length": 637.0, "completions/min_terminated_length": 637.0, "epoch": 0.9138398914518318, "frac_reward_zero_std": 0.5, "grad_norm": 0.08496204763650894, "kl": 0.0, "learning_rate": 4.399585921325052e-08, "loss": 0.0003, "num_tokens": 79993986.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2694 }, { "completion_length": 821.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1263.0, "completions/max_terminated_length": 1263.0, "completions/mean_length": 821.5, "completions/mean_terminated_length": 821.5, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 0.914179104477612, "frac_reward_zero_std": 0.0, "grad_norm": 0.12137144058942795, "kl": 0.0, "learning_rate": 4.382332643202209e-08, "loss": -0.0002, "num_tokens": 80015376.0, "reward": 1.183333396911621, "reward_std": 0.09246458858251572, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2695 }, { "completion_length": 831.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1536.0, "completions/max_terminated_length": 1536.0, "completions/mean_length": 831.5833740234375, "completions/mean_terminated_length": 831.5833740234375, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.9145183175033921, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.3650793650793646e-08, "loss": 0.0, "num_tokens": 80034757.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2696 }, { "completion_length": 688.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1866.0, "completions/max_terminated_length": 1866.0, "completions/mean_length": 688.0, "completions/mean_terminated_length": 688.0, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.9148575305291723, "frac_reward_zero_std": 1.0, "grad_norm": 1.3347472815894434e-07, "kl": 0.0, "learning_rate": 4.347826086956521e-08, "loss": 0.0, "num_tokens": 80052877.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2697 }, { "completion_length": 700.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 700.6666870117188, "completions/mean_terminated_length": 700.6666870117188, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 0.9151967435549525, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.3305728088336785e-08, "loss": 0.0, "num_tokens": 80072433.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2698 }, { "completion_length": 1003.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1480.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 1003.3333740234375, "completions/mean_terminated_length": 1003.3333740234375, "completions/min_length": 589.0, "completions/min_terminated_length": 589.0, "epoch": 0.9155359565807327, "frac_reward_zero_std": 1.0, "grad_norm": 1.0014473872388407e-07, "kl": 0.0, "learning_rate": 4.313319530710835e-08, "loss": 0.0, "num_tokens": 80092675.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2699 }, { "completion_length": 816.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1338.0, "completions/max_terminated_length": 1338.0, "completions/mean_length": 816.3333740234375, "completions/mean_terminated_length": 816.3333740234375, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.9158751696065129, "frac_reward_zero_std": 0.5, "grad_norm": 0.07577073574066162, "kl": 0.0, "learning_rate": 4.296066252587992e-08, "loss": -0.0003, "num_tokens": 80113181.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2700 }, { "completion_length": 1375.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3865.0, "completions/max_terminated_length": 3865.0, "completions/mean_length": 1375.3333740234375, "completions/mean_terminated_length": 1375.3333740234375, "completions/min_length": 665.0, "completions/min_terminated_length": 665.0, "epoch": 0.916214382632293, "frac_reward_zero_std": 0.5, "grad_norm": 0.10727521032094955, "kl": 0.0, "learning_rate": 4.2788129744651477e-08, "loss": -0.0058, "num_tokens": 80143983.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2701 }, { "completion_length": 771.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1676.0, "completions/max_terminated_length": 1676.0, "completions/mean_length": 771.25, "completions/mean_terminated_length": 771.25, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 0.9165535956580733, "frac_reward_zero_std": 0.5, "grad_norm": 0.09355942904949188, "kl": 0.0, "learning_rate": 4.261559696342305e-08, "loss": 0.0015, "num_tokens": 80166768.0, "reward": 1.2333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2702 }, { "completion_length": 919.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1533.0, "completions/max_terminated_length": 1533.0, "completions/mean_length": 919.5833740234375, "completions/mean_terminated_length": 919.5833740234375, "completions/min_length": 530.0, "completions/min_terminated_length": 530.0, "epoch": 0.9168928086838535, "frac_reward_zero_std": 1.0, "grad_norm": 1.5238586570376356e-07, "kl": 0.0, "learning_rate": 4.2443064182194615e-08, "loss": 0.0, "num_tokens": 80191495.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2703 }, { "completion_length": 900.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1522.0, "completions/max_terminated_length": 1522.0, "completions/mean_length": 900.9166870117188, "completions/mean_terminated_length": 900.9166870117188, "completions/min_length": 608.0, "completions/min_terminated_length": 608.0, "epoch": 0.9172320217096337, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.227053140096618e-08, "loss": 0.0, "num_tokens": 80217474.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2704 }, { "completion_length": 1015.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1848.0, "completions/max_terminated_length": 1848.0, "completions/mean_length": 1015.5833740234375, "completions/mean_terminated_length": 1015.5833740234375, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 0.9175712347354138, "frac_reward_zero_std": 0.5, "grad_norm": 0.3782864511013031, "kl": 0.0, "learning_rate": 4.209799861973775e-08, "loss": 0.0001, "num_tokens": 80239201.0, "reward": 0.8500000834465027, "reward_std": 0.2345207929611206, "rewards/correctness_reward_func/mean": 0.550000011920929, "rewards/correctness_reward_func/std": 0.4100997745990753, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2705 }, { "completion_length": 731.0, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 731.0, "completions/mean_terminated_length": 731.0, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.917910447761194, "frac_reward_zero_std": 1.0, "grad_norm": 1.2408168004185427e-07, "kl": 0.0, "learning_rate": 4.192546583850932e-08, "loss": 0.0, "num_tokens": 80257153.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2706 }, { "completion_length": 736.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1138.0, "completions/max_terminated_length": 1138.0, "completions/mean_length": 736.1666870117188, "completions/mean_terminated_length": 736.1666870117188, "completions/min_length": 559.0, "completions/min_terminated_length": 559.0, "epoch": 0.9182496607869742, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.175293305728088e-08, "loss": 0.0, "num_tokens": 80280129.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2707 }, { "completion_length": 1669.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4251.0, "completions/max_terminated_length": 4251.0, "completions/mean_length": 1669.8333740234375, "completions/mean_terminated_length": 1669.8333740234375, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.9185888738127544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.1580400276052446e-08, "loss": 0.0, "num_tokens": 80313637.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2708 }, { "completion_length": 712.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1032.0, "completions/max_terminated_length": 1032.0, "completions/mean_length": 712.25, "completions/mean_terminated_length": 712.25, "completions/min_length": 562.0, "completions/min_terminated_length": 562.0, "epoch": 0.9189280868385346, "frac_reward_zero_std": 1.0, "grad_norm": 1.966378277984404e-07, "kl": 0.0, "learning_rate": 4.140786749482401e-08, "loss": 0.0, "num_tokens": 80334184.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2709 }, { "completion_length": 782.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1392.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 782.75, "completions/mean_terminated_length": 782.75, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.9192672998643148, "frac_reward_zero_std": 1.0, "grad_norm": 9.387144217498644e-08, "kl": 0.0, "learning_rate": 4.1235334713595584e-08, "loss": 0.0, "num_tokens": 80355349.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2710 }, { "completion_length": 2226.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6204.0, "completions/max_terminated_length": 6204.0, "completions/mean_length": 2226.0, "completions/mean_terminated_length": 2226.0, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 0.919606512890095, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.106280193236715e-08, "loss": 0.0, "num_tokens": 80397595.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2711 }, { "completion_length": 740.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1139.0, "completions/max_terminated_length": 1139.0, "completions/mean_length": 740.75, "completions/mean_terminated_length": 740.75, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.9199457259158752, "frac_reward_zero_std": 1.0, "grad_norm": 9.332496375691335e-08, "kl": 0.0, "learning_rate": 4.0890269151138716e-08, "loss": 0.0, "num_tokens": 80418592.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2712 }, { "completion_length": 500.6666717529297, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 500.66668701171875, "completions/mean_terminated_length": 500.66668701171875, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.9202849389416553, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.0717736369910276e-08, "loss": 0.0, "num_tokens": 80435448.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2713 }, { "completion_length": 886.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2336.0, "completions/max_terminated_length": 2336.0, "completions/mean_length": 886.8333740234375, "completions/mean_terminated_length": 886.8333740234375, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 0.9206241519674355, "frac_reward_zero_std": 0.5, "grad_norm": 0.5227839350700378, "kl": 0.0, "learning_rate": 4.054520358868185e-08, "loss": -0.0006, "num_tokens": 80453638.0, "reward": 0.8833333849906921, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.5149286389350891, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2714 }, { "completion_length": 962.9167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 1921.0, "completions/max_terminated_length": 1921.0, "completions/mean_length": 962.9166870117188, "completions/mean_terminated_length": 962.9166870117188, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "epoch": 0.9209633649932157, "frac_reward_zero_std": 0.5, "grad_norm": 0.4399639368057251, "kl": 0.0, "learning_rate": 4.0372670807453415e-08, "loss": 0.0005, "num_tokens": 80476587.0, "reward": 1.2166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2715 }, { "completion_length": 409.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 409.0833435058594, "completions/mean_terminated_length": 409.0833435058594, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.921302578018996, "frac_reward_zero_std": 1.0, "grad_norm": 1.0568281538780866e-07, "kl": 0.0, "learning_rate": 4.020013802622498e-08, "loss": 0.0, "num_tokens": 80491054.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2716 }, { "completion_length": 629.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 629.3333740234375, "completions/mean_terminated_length": 629.3333740234375, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.9216417910447762, "frac_reward_zero_std": 0.5, "grad_norm": 0.08993557095527649, "kl": 0.0, "learning_rate": 4.002760524499655e-08, "loss": -0.0003, "num_tokens": 80514146.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.45000001788139343, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2717 }, { "completion_length": 1899.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 4432.0, "completions/max_terminated_length": 4432.0, "completions/mean_length": 1899.166748046875, "completions/mean_terminated_length": 1899.166748046875, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.9219810040705563, "frac_reward_zero_std": 0.5, "grad_norm": 0.7180707454681396, "kl": 0.0, "learning_rate": 3.985507246376811e-08, "loss": 0.0219, "num_tokens": 80547094.0, "reward": 0.6333333253860474, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.3333333432674408, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2718 }, { "completion_length": 1973.25, "completions/clipped_ratio": 0.0, "completions/max_length": 4633.0, "completions/max_terminated_length": 4633.0, "completions/mean_length": 1973.25, "completions/mean_terminated_length": 1973.25, "completions/min_length": 872.0, "completions/min_terminated_length": 872.0, "epoch": 0.9223202170963365, "frac_reward_zero_std": 0.5, "grad_norm": 0.5680398941040039, "kl": 0.0, "learning_rate": 3.968253968253968e-08, "loss": 0.0031, "num_tokens": 80583805.0, "reward": 0.5666667222976685, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.2666666805744171, "rewards/correctness_reward_func/std": 0.393892765045166, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2719 }, { "completion_length": 2069.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5722.0, "completions/max_terminated_length": 5722.0, "completions/mean_length": 2069.58349609375, "completions/mean_terminated_length": 2069.58349609375, "completions/min_length": 533.0, "completions/min_terminated_length": 533.0, "epoch": 0.9226594301221167, "frac_reward_zero_std": 0.0, "grad_norm": 0.6459799408912659, "kl": 0.0, "learning_rate": 3.9510006901311245e-08, "loss": 0.0, "num_tokens": 80621108.0, "reward": 1.1000001430511475, "reward_std": 0.21493908762931824, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2720 }, { "completion_length": 906.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1319.0, "completions/max_terminated_length": 1319.0, "completions/mean_length": 906.75, "completions/mean_terminated_length": 906.75, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.9229986431478969, "frac_reward_zero_std": 0.5, "grad_norm": 0.47164544463157654, "kl": 0.0, "learning_rate": 3.933747412008281e-08, "loss": -0.0042, "num_tokens": 80642063.0, "reward": 0.7166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2721 }, { "completion_length": 1531.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3480.0, "completions/max_terminated_length": 3480.0, "completions/mean_length": 1531.666748046875, "completions/mean_terminated_length": 1531.666748046875, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.923337856173677, "frac_reward_zero_std": 1.0, "grad_norm": 1.6845544337229512e-07, "kl": 0.0, "learning_rate": 3.9164941338854384e-08, "loss": 0.0, "num_tokens": 80669185.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2722 }, { "completion_length": 1805.0000610351562, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5653.0, "completions/mean_length": 2903.166748046875, "completions/mean_terminated_length": 2166.0, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.9236770691994572, "frac_reward_zero_std": 0.5, "grad_norm": 0.6275231242179871, "kl": NaN, "learning_rate": 3.899240855762595e-08, "loss": -0.0284, "num_tokens": 80699869.0, "reward": 0.8333333730697632, "reward_std": 0.24013882875442505, "rewards/correctness_reward_func/mean": 0.5833333134651184, "rewards/correctness_reward_func/std": 0.5149286389350891, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 2723 }, { "completion_length": 1895.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4392.0, "completions/max_terminated_length": 4392.0, "completions/mean_length": 1895.5833740234375, "completions/mean_terminated_length": 1895.5833740234375, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.9240162822252375, "frac_reward_zero_std": 0.5, "grad_norm": 0.09339850395917892, "kl": 0.0, "learning_rate": 3.881987577639751e-08, "loss": 0.0034, "num_tokens": 80736566.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2724 }, { "completion_length": 879.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1465.0, "completions/max_terminated_length": 1465.0, "completions/mean_length": 879.25, "completions/mean_terminated_length": 879.25, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.9243554952510177, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.8647342995169075e-08, "loss": 0.0, "num_tokens": 80760647.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2725 }, { "completion_length": 1036.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2003.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1036.0833740234375, "completions/mean_terminated_length": 1036.0833740234375, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 0.9246947082767978, "frac_reward_zero_std": 0.0, "grad_norm": 0.13546627759933472, "kl": 0.0, "learning_rate": 3.847481021394065e-08, "loss": -0.0002, "num_tokens": 80790228.0, "reward": 1.2000000476837158, "reward_std": 0.10954447835683823, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2726 }, { "completion_length": 886.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3237.0, "completions/max_terminated_length": 3237.0, "completions/mean_length": 886.1666870117188, "completions/mean_terminated_length": 886.1666870117188, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.925033921302578, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.8302277432712214e-08, "loss": 0.0, "num_tokens": 80811884.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2727 }, { "completion_length": 963.6667175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 1700.0, "completions/max_terminated_length": 1700.0, "completions/mean_length": 963.6666870117188, "completions/mean_terminated_length": 963.6666870117188, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 0.9253731343283582, "frac_reward_zero_std": 0.5, "grad_norm": 0.0825958400964737, "kl": 0.0, "learning_rate": 3.812974465148378e-08, "loss": -0.0005, "num_tokens": 80832568.0, "reward": 0.7333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2728 }, { "completion_length": 2138.0833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5584.0, "completions/mean_length": 2687.166748046875, "completions/mean_terminated_length": 2332.45458984375, "completions/min_length": 557.0, "completions/min_terminated_length": 557.0, "epoch": 0.9257123473541384, "frac_reward_zero_std": 0.0, "grad_norm": 0.7481763958930969, "kl": NaN, "learning_rate": 3.795721187025535e-08, "loss": -0.0308, "num_tokens": 80868677.0, "reward": 1.0416667461395264, "reward_std": 0.3523333966732025, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941801071167, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 2729 }, { "completion_length": 1619.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3434.0, "completions/max_terminated_length": 3434.0, "completions/mean_length": 1619.3333740234375, "completions/mean_terminated_length": 1619.3333740234375, "completions/min_length": 644.0, "completions/min_terminated_length": 644.0, "epoch": 0.9260515603799185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.778467908902691e-08, "loss": 0.0, "num_tokens": 80905017.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2730 }, { "completion_length": 1135.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1952.0, "completions/max_terminated_length": 1952.0, "completions/mean_length": 1135.416748046875, "completions/mean_terminated_length": 1135.416748046875, "completions/min_length": 536.0, "completions/min_terminated_length": 536.0, "epoch": 0.9263907734056988, "frac_reward_zero_std": 0.5, "grad_norm": 0.4716922342777252, "kl": 0.0, "learning_rate": 3.761214630779848e-08, "loss": -0.0015, "num_tokens": 80929082.0, "reward": 0.8666667342185974, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.42497774958610535, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2731 }, { "completion_length": 901.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1398.0, "completions/max_terminated_length": 1398.0, "completions/mean_length": 901.25, "completions/mean_terminated_length": 901.25, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 0.926729986431479, "frac_reward_zero_std": 0.5, "grad_norm": 0.07383216172456741, "kl": 0.0, "learning_rate": 3.7439613526570044e-08, "loss": -0.0007, "num_tokens": 80950895.0, "reward": 0.7333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2732 }, { "completion_length": 522.6666717529297, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 522.6666870117188, "completions/mean_terminated_length": 522.6666870117188, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.9270691994572592, "frac_reward_zero_std": 0.5, "grad_norm": 0.07973771542310715, "kl": 0.0, "learning_rate": 3.726708074534162e-08, "loss": -0.0002, "num_tokens": 80973145.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2733 }, { "completion_length": 976.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2236.0, "completions/max_terminated_length": 2236.0, "completions/mean_length": 976.5, "completions/mean_terminated_length": 976.5, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.9274084124830394, "frac_reward_zero_std": 0.0, "grad_norm": 0.5184952020645142, "kl": 0.0, "learning_rate": 3.709454796411318e-08, "loss": 0.0086, "num_tokens": 80999677.0, "reward": 0.8500000834465027, "reward_std": 0.2473839521408081, "rewards/correctness_reward_func/mean": 0.550000011920929, "rewards/correctness_reward_func/std": 0.4100997745990753, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2734 }, { "completion_length": 3021.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6545.0, "completions/max_terminated_length": 6545.0, "completions/mean_length": 3021.25, "completions/mean_terminated_length": 3021.25, "completions/min_length": 686.0, "completions/min_terminated_length": 686.0, "epoch": 0.9277476255088195, "frac_reward_zero_std": 0.5, "grad_norm": 0.08051112294197083, "kl": 0.0, "learning_rate": 3.692201518288475e-08, "loss": -0.0005, "num_tokens": 81049072.0, "reward": 0.7333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2735 }, { "completion_length": 1443.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3358.0, "completions/max_terminated_length": 3358.0, "completions/mean_length": 1443.666748046875, "completions/mean_terminated_length": 1443.666748046875, "completions/min_length": 678.0, "completions/min_terminated_length": 678.0, "epoch": 0.9280868385345997, "frac_reward_zero_std": 0.5, "grad_norm": 0.41724273562431335, "kl": 0.0, "learning_rate": 3.674948240165631e-08, "loss": -0.009, "num_tokens": 81077016.0, "reward": 1.1000001430511475, "reward_std": 0.20000001788139343, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.2696799635887146, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2736 }, { "completion_length": 1088.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1990.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 1088.0833740234375, "completions/mean_terminated_length": 1088.0833740234375, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 0.9284260515603799, "frac_reward_zero_std": 0.5, "grad_norm": 0.08445857465267181, "kl": 0.0, "learning_rate": 3.657694962042788e-08, "loss": 0.0007, "num_tokens": 81105709.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2737 }, { "completion_length": 664.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1452.0, "completions/max_terminated_length": 1452.0, "completions/mean_length": 664.75, "completions/mean_terminated_length": 664.75, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.9287652645861602, "frac_reward_zero_std": 0.5, "grad_norm": 0.10703444480895996, "kl": 0.0, "learning_rate": 3.640441683919945e-08, "loss": -0.0009, "num_tokens": 81127246.0, "reward": 1.2333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2738 }, { "completion_length": 685.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1251.0, "completions/max_terminated_length": 1251.0, "completions/mean_length": 685.0833740234375, "completions/mean_terminated_length": 685.0833740234375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.9291044776119403, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.6231884057971014e-08, "loss": 0.0, "num_tokens": 81145553.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2739 }, { "completion_length": 1272.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3343.0, "completions/max_terminated_length": 3343.0, "completions/mean_length": 1272.166748046875, "completions/mean_terminated_length": 1272.166748046875, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.9294436906377205, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.605935127674258e-08, "loss": 0.0, "num_tokens": 81174271.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2740 }, { "completion_length": 636.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 636.0, "completions/mean_terminated_length": 636.0, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.9297829036635007, "frac_reward_zero_std": 0.5, "grad_norm": 0.08882219344377518, "kl": 0.0, "learning_rate": 3.5886818495514146e-08, "loss": -0.0004, "num_tokens": 81196237.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2741 }, { "completion_length": 2373.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3762.0, "completions/max_terminated_length": 3762.0, "completions/mean_length": 2373.58349609375, "completions/mean_terminated_length": 2373.58349609375, "completions/min_length": 1391.0, "completions/min_terminated_length": 1391.0, "epoch": 0.9301221166892809, "frac_reward_zero_std": 0.5, "grad_norm": 0.5399261116981506, "kl": 0.0, "learning_rate": 3.571428571428571e-08, "loss": -0.0211, "num_tokens": 81239264.0, "reward": 0.6000000238418579, "reward_std": 0.23664318025112152, "rewards/correctness_reward_func/mean": 0.29999998211860657, "rewards/correctness_reward_func/std": 0.4472135901451111, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2742 }, { "completion_length": 1588.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4174.0, "completions/max_terminated_length": 4174.0, "completions/mean_length": 1588.5, "completions/mean_terminated_length": 1588.5, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 0.930461329715061, "frac_reward_zero_std": 1.0, "grad_norm": 1.572479106926039e-07, "kl": 0.0, "learning_rate": 3.554175293305728e-08, "loss": 0.0, "num_tokens": 81268532.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2743 }, { "completion_length": 1959.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4726.0, "completions/max_terminated_length": 4726.0, "completions/mean_length": 1959.8333740234375, "completions/mean_terminated_length": 1959.8333740234375, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 0.9308005427408412, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.5369220151828844e-08, "loss": 0.0, "num_tokens": 81301122.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2744 }, { "completion_length": 528.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 528.25, "completions/mean_terminated_length": 528.25, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.9311397557666214, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.5196687370600417e-08, "loss": 0.0, "num_tokens": 81318177.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2745 }, { "completion_length": 1553.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2822.0, "completions/max_terminated_length": 2822.0, "completions/mean_length": 1553.25, "completions/mean_terminated_length": 1553.25, "completions/min_length": 565.0, "completions/min_terminated_length": 565.0, "epoch": 0.9314789687924017, "frac_reward_zero_std": 1.0, "grad_norm": 1.6427061666490772e-07, "kl": 0.0, "learning_rate": 3.502415458937198e-08, "loss": 0.0, "num_tokens": 81350196.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2746 }, { "completion_length": 567.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 567.1666870117188, "completions/mean_terminated_length": 567.1666870117188, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.9318181818181818, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.485162180814354e-08, "loss": 0.0, "num_tokens": 81369806.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2747 }, { "completion_length": 744.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1377.0, "completions/max_terminated_length": 1377.0, "completions/mean_length": 744.75, "completions/mean_terminated_length": 744.75, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 0.932157394843962, "frac_reward_zero_std": 0.5, "grad_norm": 0.08844713121652603, "kl": 0.0, "learning_rate": 3.467908902691511e-08, "loss": -0.0002, "num_tokens": 81392057.0, "reward": 0.7333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2748 }, { "completion_length": 1178.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2366.0, "completions/max_terminated_length": 2366.0, "completions/mean_length": 1178.5833740234375, "completions/mean_terminated_length": 1178.5833740234375, "completions/min_length": 596.0, "completions/min_terminated_length": 596.0, "epoch": 0.9324966078697422, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.450655624568668e-08, "loss": 0.0, "num_tokens": 81414570.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2749 }, { "completion_length": 548.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1251.0, "completions/max_terminated_length": 1251.0, "completions/mean_length": 548.4166870117188, "completions/mean_terminated_length": 548.4166870117188, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.9328358208955224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.433402346445825e-08, "loss": 0.0, "num_tokens": 81435671.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2750 }, { "completion_length": 632.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 632.0833740234375, "completions/mean_terminated_length": 632.0833740234375, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.9331750339213026, "frac_reward_zero_std": 0.5, "grad_norm": 0.08587653189897537, "kl": 0.0, "learning_rate": 3.416149068322981e-08, "loss": 0.0003, "num_tokens": 81453234.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2751 }, { "completion_length": 967.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1243.0, "completions/max_terminated_length": 1243.0, "completions/mean_length": 967.3333740234375, "completions/mean_terminated_length": 967.3333740234375, "completions/min_length": 547.0, "completions/min_terminated_length": 547.0, "epoch": 0.9335142469470827, "frac_reward_zero_std": 1.0, "grad_norm": 1.4377275192600791e-07, "kl": 0.0, "learning_rate": 3.398895790200138e-08, "loss": 0.0, "num_tokens": 81476266.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2752 }, { "completion_length": 1580.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2406.0, "completions/max_terminated_length": 2406.0, "completions/mean_length": 1580.416748046875, "completions/mean_terminated_length": 1580.416748046875, "completions/min_length": 956.0, "completions/min_terminated_length": 956.0, "epoch": 0.933853459972863, "frac_reward_zero_std": 0.0, "grad_norm": 0.14447851479053497, "kl": 0.0, "learning_rate": 3.3816425120772945e-08, "loss": -0.0016, "num_tokens": 81505665.0, "reward": 1.1500000953674316, "reward_std": 0.09246459603309631, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.09045340120792389, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2753 }, { "completion_length": 644.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 644.6666870117188, "completions/mean_terminated_length": 644.6666870117188, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.9341926729986432, "frac_reward_zero_std": 0.0, "grad_norm": 0.3457517921924591, "kl": 0.0, "learning_rate": 3.364389233954451e-08, "loss": -0.0013, "num_tokens": 81526043.0, "reward": 1.1375000476837158, "reward_std": 0.2863825261592865, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.28444522619247437, "rewards/format_reward_func/mean": 0.2875000238418579, "rewards/format_reward_func/std": 0.04330127313733101, "step": 2754 }, { "completion_length": 851.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2057.0, "completions/max_terminated_length": 2057.0, "completions/mean_length": 851.9166870117188, "completions/mean_terminated_length": 851.9166870117188, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 0.9345318860244234, "frac_reward_zero_std": 0.5, "grad_norm": 0.048769623041152954, "kl": 0.0, "learning_rate": 3.347135955831608e-08, "loss": -0.0004, "num_tokens": 81549346.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2755 }, { "completion_length": 842.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1136.0, "completions/max_terminated_length": 1136.0, "completions/mean_length": 842.0833740234375, "completions/mean_terminated_length": 842.0833740234375, "completions/min_length": 635.0, "completions/min_terminated_length": 635.0, "epoch": 0.9348710990502035, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.329882677708764e-08, "loss": 0.0, "num_tokens": 81573095.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2756 }, { "completion_length": 1417.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3001.0, "completions/max_terminated_length": 3001.0, "completions/mean_length": 1417.0833740234375, "completions/mean_terminated_length": 1417.0833740234375, "completions/min_length": 587.0, "completions/min_terminated_length": 587.0, "epoch": 0.9352103120759837, "frac_reward_zero_std": 0.0, "grad_norm": 0.7032153606414795, "kl": 0.0, "learning_rate": 3.3126293995859216e-08, "loss": -0.0128, "num_tokens": 81596418.0, "reward": 0.9166666865348816, "reward_std": 0.46005114912986755, "rewards/correctness_reward_func/mean": 0.6166666746139526, "rewards/correctness_reward_func/std": 0.4628632962703705, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2757 }, { "completion_length": 553.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 553.5833740234375, "completions/mean_terminated_length": 553.5833740234375, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 0.9355495251017639, "frac_reward_zero_std": 1.0, "grad_norm": 9.640053377779623e-08, "kl": 0.0, "learning_rate": 3.2953761214630775e-08, "loss": 0.0, "num_tokens": 81614671.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2758 }, { "completion_length": 1350.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2435.0, "completions/max_terminated_length": 2435.0, "completions/mean_length": 1350.5, "completions/mean_terminated_length": 1350.5, "completions/min_length": 683.0, "completions/min_terminated_length": 683.0, "epoch": 0.9358887381275441, "frac_reward_zero_std": 0.0, "grad_norm": 0.4094085395336151, "kl": 0.0, "learning_rate": 3.278122843340234e-08, "loss": -0.0065, "num_tokens": 81642421.0, "reward": 1.133333444595337, "reward_std": 0.24494896829128265, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2759 }, { "completion_length": 1131.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2371.0, "completions/max_terminated_length": 2371.0, "completions/mean_length": 1131.5, "completions/mean_terminated_length": 1131.5, "completions/min_length": 621.0, "completions/min_terminated_length": 621.0, "epoch": 0.9362279511533242, "frac_reward_zero_std": 0.0, "grad_norm": 0.3299919068813324, "kl": 0.0, "learning_rate": 3.260869565217391e-08, "loss": 0.0029, "num_tokens": 81670117.0, "reward": 1.1666667461395264, "reward_std": 0.23490384221076965, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.287096232175827, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2760 }, { "completion_length": 1148.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2630.0, "completions/max_terminated_length": 2630.0, "completions/mean_length": 1148.5, "completions/mean_terminated_length": 1148.5, "completions/min_length": 722.0, "completions/min_terminated_length": 722.0, "epoch": 0.9365671641791045, "frac_reward_zero_std": 0.5, "grad_norm": 0.10499429702758789, "kl": 0.0, "learning_rate": 3.243616287094548e-08, "loss": -0.0029, "num_tokens": 81696883.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2761 }, { "completion_length": 1072.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2122.0, "completions/max_terminated_length": 2122.0, "completions/mean_length": 1072.666748046875, "completions/mean_terminated_length": 1072.666748046875, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 0.9369063772048847, "frac_reward_zero_std": 0.5, "grad_norm": 0.08810733258724213, "kl": 0.0, "learning_rate": 3.2263630089717046e-08, "loss": 0.0004, "num_tokens": 81719085.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2762 }, { "completion_length": 514.9166717529297, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 514.9166870117188, "completions/mean_terminated_length": 514.9166870117188, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.9372455902306649, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.209109730848861e-08, "loss": 0.0, "num_tokens": 81741062.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2763 }, { "completion_length": 739.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1636.0, "completions/max_terminated_length": 1636.0, "completions/mean_length": 739.3333740234375, "completions/mean_terminated_length": 739.3333740234375, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.937584803256445, "frac_reward_zero_std": 0.0, "grad_norm": 0.6073358654975891, "kl": 0.0, "learning_rate": 3.191856452726017e-08, "loss": 0.002, "num_tokens": 81761832.0, "reward": 0.8666666746139526, "reward_std": 0.44320234656333923, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.42497774958610535, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2764 }, { "completion_length": 1597.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3178.0, "completions/max_terminated_length": 3178.0, "completions/mean_length": 1597.8333740234375, "completions/mean_terminated_length": 1597.8333740234375, "completions/min_length": 615.0, "completions/min_terminated_length": 615.0, "epoch": 0.9379240162822252, "frac_reward_zero_std": 0.5, "grad_norm": 0.29867738485336304, "kl": 0.0, "learning_rate": 3.1746031746031744e-08, "loss": -0.0007, "num_tokens": 81796816.0, "reward": 0.7000000476837158, "reward_std": 0.19999998807907104, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4972652792930603, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2765 }, { "completion_length": 896.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1957.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 896.9166870117188, "completions/mean_terminated_length": 896.9166870117188, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 0.9382632293080054, "frac_reward_zero_std": 0.5, "grad_norm": 0.1044485941529274, "kl": 0.0, "learning_rate": 3.157349896480331e-08, "loss": -0.001, "num_tokens": 81820101.0, "reward": 1.2333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2766 }, { "completion_length": 1060.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1823.0, "completions/max_terminated_length": 1823.0, "completions/mean_length": 1060.0, "completions/mean_terminated_length": 1060.0, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.9386024423337856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.1400966183574877e-08, "loss": 0.0, "num_tokens": 81845727.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2767 }, { "completion_length": 1666.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2880.0, "completions/max_terminated_length": 2880.0, "completions/mean_length": 1666.5, "completions/mean_terminated_length": 1666.5, "completions/min_length": 1116.0, "completions/min_terminated_length": 1116.0, "epoch": 0.9389416553595658, "frac_reward_zero_std": 0.5, "grad_norm": 0.07462151348590851, "kl": 0.0, "learning_rate": 3.122843340234644e-08, "loss": -0.0008, "num_tokens": 81879651.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2768 }, { "completion_length": 1043.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2014.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1043.166748046875, "completions/mean_terminated_length": 1043.166748046875, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.939280868385346, "frac_reward_zero_std": 0.5, "grad_norm": 0.4429294764995575, "kl": 0.0, "learning_rate": 3.105590062111801e-08, "loss": 0.0006, "num_tokens": 81903107.0, "reward": 0.9666668176651001, "reward_std": 0.20655910670757294, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2769 }, { "completion_length": 564.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 564.8333740234375, "completions/mean_terminated_length": 564.8333740234375, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.9396200814111262, "frac_reward_zero_std": 1.0, "grad_norm": 1.1486537232485716e-07, "kl": 0.0, "learning_rate": 3.088336783988958e-08, "loss": 0.0, "num_tokens": 81924279.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2770 }, { "completion_length": 764.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1737.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 764.5833740234375, "completions/mean_terminated_length": 764.5833740234375, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.9399592944369064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.071083505866114e-08, "loss": 0.0, "num_tokens": 81942202.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2771 }, { "completion_length": 1128.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 1993.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1128.416748046875, "completions/mean_terminated_length": 1128.416748046875, "completions/min_length": 662.0, "completions/min_terminated_length": 662.0, "epoch": 0.9402985074626866, "frac_reward_zero_std": 0.5, "grad_norm": 0.06828495115041733, "kl": 0.0, "learning_rate": 3.0538302277432714e-08, "loss": -0.0016, "num_tokens": 81968805.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2772 }, { "completion_length": 742.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1049.0, "completions/max_terminated_length": 1049.0, "completions/mean_length": 742.0, "completions/mean_terminated_length": 742.0, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 0.9406377204884667, "frac_reward_zero_std": 1.0, "grad_norm": 1.2155248896306148e-07, "kl": 0.0, "learning_rate": 3.036576949620428e-08, "loss": 0.0, "num_tokens": 81986703.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2773 }, { "completion_length": 912.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1639.0, "completions/max_terminated_length": 1639.0, "completions/mean_length": 912.0833740234375, "completions/mean_terminated_length": 912.0833740234375, "completions/min_length": 608.0, "completions/min_terminated_length": 608.0, "epoch": 0.9409769335142469, "frac_reward_zero_std": 1.0, "grad_norm": 8.988226340989058e-08, "kl": 0.0, "learning_rate": 3.0193236714975846e-08, "loss": 0.0, "num_tokens": 82008508.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2774 }, { "completion_length": 636.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1120.0, "completions/max_terminated_length": 1120.0, "completions/mean_length": 636.5833740234375, "completions/mean_terminated_length": 636.5833740234375, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 0.9413161465400272, "frac_reward_zero_std": 0.5, "grad_norm": 0.06924429535865784, "kl": 0.0, "learning_rate": 3.002070393374741e-08, "loss": -0.0001, "num_tokens": 82025693.0, "reward": 1.1500000953674316, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.8499999642372131, "rewards/correctness_reward_func/std": 0.09045340120792389, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2775 }, { "completion_length": 535.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 535.3333740234375, "completions/mean_terminated_length": 535.3333740234375, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.9416553595658074, "frac_reward_zero_std": 1.0, "grad_norm": 1.0421382512504351e-07, "kl": 0.0, "learning_rate": 2.984817115251898e-08, "loss": 0.0, "num_tokens": 82044837.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2776 }, { "completion_length": 963.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1559.0, "completions/max_terminated_length": 1559.0, "completions/mean_length": 963.1666870117188, "completions/mean_terminated_length": 963.1666870117188, "completions/min_length": 517.0, "completions/min_terminated_length": 517.0, "epoch": 0.9419945725915875, "frac_reward_zero_std": 0.0, "grad_norm": 0.11357308179140091, "kl": 0.0, "learning_rate": 2.9675638371290544e-08, "loss": 0.0003, "num_tokens": 82065113.0, "reward": 1.25, "reward_std": 0.09246455878019333, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2777 }, { "completion_length": 1389.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2517.0, "completions/max_terminated_length": 2517.0, "completions/mean_length": 1389.3333740234375, "completions/mean_terminated_length": 1389.3333740234375, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 0.9423337856173677, "frac_reward_zero_std": 1.0, "grad_norm": 2.470472395543766e-07, "kl": 0.0, "learning_rate": 2.950310559006211e-08, "loss": 0.0, "num_tokens": 82092609.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2778 }, { "completion_length": 930.4167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2396.0, "completions/max_terminated_length": 2396.0, "completions/mean_length": 930.4166870117188, "completions/mean_terminated_length": 930.4166870117188, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.9426729986431479, "frac_reward_zero_std": 0.5, "grad_norm": 0.03445601463317871, "kl": 0.0, "learning_rate": 2.933057280883368e-08, "loss": -0.0001, "num_tokens": 82117508.0, "reward": 0.7875000834465027, "reward_std": 0.03061862848699093, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2874999940395355, "rewards/format_reward_func/std": 0.04330127313733101, "step": 2779 }, { "completion_length": 1476.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2409.0, "completions/max_terminated_length": 2409.0, "completions/mean_length": 1476.75, "completions/mean_terminated_length": 1476.75, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "epoch": 0.9430122116689281, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.9158040027605242e-08, "loss": 0.0, "num_tokens": 82144997.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2780 }, { "completion_length": 592.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1082.0, "completions/max_terminated_length": 1082.0, "completions/mean_length": 592.5833740234375, "completions/mean_terminated_length": 592.5833740234375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.9433514246947082, "frac_reward_zero_std": 0.5, "grad_norm": 0.07974367588758469, "kl": 0.0, "learning_rate": 2.898550724637681e-08, "loss": 0.0005, "num_tokens": 82169340.0, "reward": 1.2333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2781 }, { "completion_length": 1213.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2163.0, "completions/max_terminated_length": 2163.0, "completions/mean_length": 1213.5833740234375, "completions/mean_terminated_length": 1213.5833740234375, "completions/min_length": 491.0, "completions/min_terminated_length": 491.0, "epoch": 0.9436906377204884, "frac_reward_zero_std": 0.5, "grad_norm": 0.07879916578531265, "kl": 0.0, "learning_rate": 2.8812974465148378e-08, "loss": -0.0, "num_tokens": 82200823.0, "reward": 1.133333444595337, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2782 }, { "completion_length": 1224.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2552.0, "completions/max_terminated_length": 2552.0, "completions/mean_length": 1224.166748046875, "completions/mean_terminated_length": 1224.166748046875, "completions/min_length": 596.0, "completions/min_terminated_length": 596.0, "epoch": 0.9440298507462687, "frac_reward_zero_std": 0.0, "grad_norm": 0.30684712529182434, "kl": 0.0, "learning_rate": 2.8640441683919944e-08, "loss": 0.0018, "num_tokens": 82227729.0, "reward": 0.8000000715255737, "reward_std": 0.21493908762931824, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.4472135901451111, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2783 }, { "completion_length": 668.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 668.0833740234375, "completions/mean_terminated_length": 668.0833740234375, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 0.9443690637720489, "frac_reward_zero_std": 1.0, "grad_norm": 1.0367643454856079e-07, "kl": 0.0, "learning_rate": 2.846790890269151e-08, "loss": 0.0, "num_tokens": 82244764.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2784 }, { "completion_length": 1654.5, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5620.0, "completions/mean_length": 2203.58349609375, "completions/mean_terminated_length": 1804.9091796875, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 0.944708276797829, "frac_reward_zero_std": 0.0, "grad_norm": 0.4633944630622864, "kl": NaN, "learning_rate": 2.8295376121463076e-08, "loss": -0.0435, "num_tokens": 82273978.0, "reward": 1.1083333492279053, "reward_std": 0.3061862587928772, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.28069180250167847, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 2785 }, { "completion_length": 1184.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1936.0, "completions/max_terminated_length": 1936.0, "completions/mean_length": 1184.0833740234375, "completions/mean_terminated_length": 1184.0833740234375, "completions/min_length": 664.0, "completions/min_terminated_length": 664.0, "epoch": 0.9450474898236092, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.8122843340234642e-08, "loss": 0.0, "num_tokens": 82304291.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2786 }, { "completion_length": 738.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 954.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 738.6666870117188, "completions/mean_terminated_length": 738.6666870117188, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 0.9453867028493894, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.795031055900621e-08, "loss": 0.0, "num_tokens": 82328923.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2787 }, { "completion_length": 644.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1252.0, "completions/max_terminated_length": 1252.0, "completions/mean_length": 644.75, "completions/mean_terminated_length": 644.75, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.9457259158751696, "frac_reward_zero_std": 1.0, "grad_norm": 1.221811629648073e-07, "kl": 0.0, "learning_rate": 2.7777777777777774e-08, "loss": 0.0, "num_tokens": 82348756.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2788 }, { "completion_length": 2065.5834350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 3992.0, "completions/max_terminated_length": 3992.0, "completions/mean_length": 2065.58349609375, "completions/mean_terminated_length": 2065.58349609375, "completions/min_length": 1096.0, "completions/min_terminated_length": 1096.0, "epoch": 0.9460651289009498, "frac_reward_zero_std": 0.5, "grad_norm": 0.5832897424697876, "kl": 0.0, "learning_rate": 2.7605244996549343e-08, "loss": -0.0037, "num_tokens": 82383839.0, "reward": 1.0333333015441895, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.44585633277893066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2789 }, { "completion_length": 1483.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5053.0, "completions/max_terminated_length": 5053.0, "completions/mean_length": 1483.8333740234375, "completions/mean_terminated_length": 1483.8333740234375, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.94640434192673, "frac_reward_zero_std": 0.5, "grad_norm": 0.5928127765655518, "kl": 0.0, "learning_rate": 2.743271221532091e-08, "loss": 0.0292, "num_tokens": 82416987.0, "reward": 0.7166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2790 }, { "completion_length": 1272.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2626.0, "completions/max_terminated_length": 2626.0, "completions/mean_length": 1272.666748046875, "completions/mean_terminated_length": 1272.666748046875, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "epoch": 0.9467435549525102, "frac_reward_zero_std": 0.5, "grad_norm": 0.6962557435035706, "kl": 0.0, "learning_rate": 2.7260179434092475e-08, "loss": 0.0167, "num_tokens": 82445519.0, "reward": 0.5499999523162842, "reward_std": 0.27386125922203064, "rewards/correctness_reward_func/mean": 0.25, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2791 }, { "completion_length": 1651.5000610351562, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5337.0, "completions/mean_length": 4396.9169921875, "completions/mean_terminated_length": 2831.14306640625, "completions/min_length": 1216.0, "completions/min_terminated_length": 1216.0, "epoch": 0.9470827679782904, "frac_reward_zero_std": 0.0, "grad_norm": 0.7158437967300415, "kl": NaN, "learning_rate": 2.708764665286404e-08, "loss": -0.0612, "num_tokens": 82479461.0, "reward": 0.42500001192092896, "reward_std": 0.3350985050201416, "rewards/correctness_reward_func/mean": 0.25, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.17500001192092896, "rewards/format_reward_func/std": 0.15447859466075897, "step": 2792 }, { "completion_length": 1038.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2139.0, "completions/max_terminated_length": 2139.0, "completions/mean_length": 1038.75, "completions/mean_terminated_length": 1038.75, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.9474219810040706, "frac_reward_zero_std": 1.0, "grad_norm": 1.1280956613290982e-07, "kl": 0.0, "learning_rate": 2.691511387163561e-08, "loss": 0.0, "num_tokens": 82501856.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2793 }, { "completion_length": 959.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2735.0, "completions/max_terminated_length": 2735.0, "completions/mean_length": 959.75, "completions/mean_terminated_length": 959.75, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.9477611940298507, "frac_reward_zero_std": 1.0, "grad_norm": 1.69140449202132e-07, "kl": 0.0, "learning_rate": 2.6742581090407174e-08, "loss": 0.0, "num_tokens": 82523609.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2794 }, { "completion_length": 1062.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1915.0, "completions/max_terminated_length": 1915.0, "completions/mean_length": 1062.75, "completions/mean_terminated_length": 1062.75, "completions/min_length": 559.0, "completions/min_terminated_length": 559.0, "epoch": 0.9481004070556309, "frac_reward_zero_std": 0.5, "grad_norm": 0.09556128084659576, "kl": 0.0, "learning_rate": 2.6570048309178743e-08, "loss": -0.0012, "num_tokens": 82551572.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2795 }, { "completion_length": 2183.8334350585938, "completions/clipped_ratio": 0.0, "completions/max_length": 5969.0, "completions/max_terminated_length": 5969.0, "completions/mean_length": 2183.83349609375, "completions/mean_terminated_length": 2183.83349609375, "completions/min_length": 567.0, "completions/min_terminated_length": 567.0, "epoch": 0.9484396200814111, "frac_reward_zero_std": 1.0, "grad_norm": 9.819653712384024e-08, "kl": 0.0, "learning_rate": 2.639751552795031e-08, "loss": 0.0, "num_tokens": 82590024.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2796 }, { "completion_length": 1663.0000610351562, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6533.0, "completions/mean_length": 2761.166748046875, "completions/mean_terminated_length": 1995.5999755859375, "completions/min_length": 590.0, "completions/min_terminated_length": 590.0, "epoch": 0.9487788331071914, "frac_reward_zero_std": 0.5, "grad_norm": 0.4905873239040375, "kl": NaN, "learning_rate": 2.6224982746721875e-08, "loss": -0.072, "num_tokens": 82622862.0, "reward": 0.9166667461395264, "reward_std": 0.2840188145637512, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.25, "rewards/format_reward_func/std": 0.11677484214305878, "step": 2797 }, { "completion_length": 679.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1031.0, "completions/max_terminated_length": 1031.0, "completions/mean_length": 679.3333740234375, "completions/mean_terminated_length": 679.3333740234375, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.9491180461329715, "frac_reward_zero_std": 0.5, "grad_norm": 0.07221175730228424, "kl": 0.0, "learning_rate": 2.605244996549344e-08, "loss": -0.0002, "num_tokens": 82641358.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2798 }, { "completion_length": 1456.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 5083.0, "completions/max_terminated_length": 5083.0, "completions/mean_length": 1456.5, "completions/mean_terminated_length": 1456.5, "completions/min_length": 520.0, "completions/min_terminated_length": 520.0, "epoch": 0.9494572591587517, "frac_reward_zero_std": 0.0, "grad_norm": 0.691153347492218, "kl": 0.0, "learning_rate": 2.587991718426501e-08, "loss": 0.0407, "num_tokens": 82665838.0, "reward": 1.1666667461395264, "reward_std": 0.25163978338241577, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.287096232175827, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2799 }, { "completion_length": 1884.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3758.0, "completions/max_terminated_length": 3758.0, "completions/mean_length": 1884.666748046875, "completions/mean_terminated_length": 1884.666748046875, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.9497964721845319, "frac_reward_zero_std": 0.5, "grad_norm": 0.29616227746009827, "kl": 0.0, "learning_rate": 2.5707384403036573e-08, "loss": 0.0029, "num_tokens": 82699386.0, "reward": 1.0, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.699999988079071, "rewards/correctness_reward_func/std": 0.43064433336257935, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2800 }, { "completion_length": 1365.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 2206.0, "completions/max_terminated_length": 2206.0, "completions/mean_length": 1365.916748046875, "completions/mean_terminated_length": 1365.916748046875, "completions/min_length": 536.0, "completions/min_terminated_length": 536.0, "epoch": 0.9501356852103121, "frac_reward_zero_std": 1.0, "grad_norm": 1.7834641141689644e-07, "kl": 0.0, "learning_rate": 2.5534851621808143e-08, "loss": 0.0, "num_tokens": 82729139.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2801 }, { "completion_length": 1766.5, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 5774.0, "completions/mean_length": 3413.75, "completions/mean_terminated_length": 2355.333251953125, "completions/min_length": 859.0, "completions/min_terminated_length": 859.0, "epoch": 0.9504748982360922, "frac_reward_zero_std": 0.5, "grad_norm": 0.04109666123986244, "kl": NaN, "learning_rate": 2.5362318840579712e-08, "loss": 0.0052, "num_tokens": 82763201.0, "reward": 0.8166667222976685, "reward_std": 0.20165979862213135, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 2802 }, { "completion_length": 548.0000152587891, "completions/clipped_ratio": 0.0, "completions/max_length": 754.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 548.0, "completions/mean_terminated_length": 548.0, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 0.9508141112618724, "frac_reward_zero_std": 0.0, "grad_norm": 0.07300414144992828, "kl": 0.0, "learning_rate": 2.5189786059351275e-08, "loss": 0.0, "num_tokens": 82780565.0, "reward": 1.2000000476837158, "reward_std": 0.08164963126182556, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2803 }, { "completion_length": 666.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 954.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 666.3333740234375, "completions/mean_terminated_length": 666.3333740234375, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.9511533242876526, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.5017253278122844e-08, "loss": 0.0, "num_tokens": 82799211.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2804 }, { "completion_length": 746.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 746.0833740234375, "completions/mean_terminated_length": 746.0833740234375, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 0.9514925373134329, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.4844720496894407e-08, "loss": 0.0, "num_tokens": 82816696.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2805 }, { "completion_length": 2548.5833740234375, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 5877.0, "completions/mean_length": 5294.0, "completions/mean_terminated_length": 4369.0, "completions/min_length": 2211.0, "completions/min_terminated_length": 2211.0, "epoch": 0.9518317503392131, "frac_reward_zero_std": 0.5, "grad_norm": 0.2608247697353363, "kl": NaN, "learning_rate": 2.4672187715665976e-08, "loss": -0.0167, "num_tokens": 82858931.0, "reward": 0.17500001192092896, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.17499999701976776, "rewards/format_reward_func/std": 0.15447859466075897, "step": 2806 }, { "completion_length": 1205.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2946.0, "completions/max_terminated_length": 2946.0, "completions/mean_length": 1205.25, "completions/mean_terminated_length": 1205.25, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.9521709633649932, "frac_reward_zero_std": 1.0, "grad_norm": 1.2968797591383918e-07, "kl": 0.0, "learning_rate": 2.4499654934437542e-08, "loss": 0.0, "num_tokens": 82885844.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2807 }, { "completion_length": 1082.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2555.0, "completions/max_terminated_length": 2555.0, "completions/mean_length": 1082.416748046875, "completions/mean_terminated_length": 1082.416748046875, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 0.9525101763907734, "frac_reward_zero_std": 0.5, "grad_norm": 0.07481320202350616, "kl": 0.0, "learning_rate": 2.432712215320911e-08, "loss": 0.0023, "num_tokens": 82905973.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2808 }, { "completion_length": 749.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1450.0, "completions/max_terminated_length": 1450.0, "completions/mean_length": 749.3333740234375, "completions/mean_terminated_length": 749.3333740234375, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.9528493894165536, "frac_reward_zero_std": 0.0, "grad_norm": 0.11949588358402252, "kl": 0.0, "learning_rate": 2.4154589371980675e-08, "loss": 0.0004, "num_tokens": 82929047.0, "reward": 1.2000000476837158, "reward_std": 0.10954447835683823, "rewards/correctness_reward_func/mean": 0.9000000357627869, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2809 }, { "completion_length": 1455.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2762.0, "completions/max_terminated_length": 2762.0, "completions/mean_length": 1455.5, "completions/mean_terminated_length": 1455.5, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.9531886024423338, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.3982056590752244e-08, "loss": 0.0, "num_tokens": 82957325.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2810 }, { "completion_length": 1354.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2694.0, "completions/max_terminated_length": 2694.0, "completions/mean_length": 1354.5, "completions/mean_terminated_length": 1354.5, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 0.9535278154681139, "frac_reward_zero_std": 0.5, "grad_norm": 0.46453961730003357, "kl": 0.0, "learning_rate": 2.3809523809523807e-08, "loss": -0.0185, "num_tokens": 82984559.0, "reward": 0.9000000953674316, "reward_std": 0.21908903121948242, "rewards/correctness_reward_func/mean": 0.6000000238418579, "rewards/correctness_reward_func/std": 0.36181363463401794, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2811 }, { "completion_length": 1038.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1634.0, "completions/max_terminated_length": 1634.0, "completions/mean_length": 1038.75, "completions/mean_terminated_length": 1038.75, "completions/min_length": 662.0, "completions/min_terminated_length": 662.0, "epoch": 0.9538670284938942, "frac_reward_zero_std": 1.0, "grad_norm": 1.0349900492201414e-07, "kl": 0.0, "learning_rate": 2.3636991028295376e-08, "loss": 0.0, "num_tokens": 83007266.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2812 }, { "completion_length": 768.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1663.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 768.25, "completions/mean_terminated_length": 768.25, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.9542062415196744, "frac_reward_zero_std": 0.5, "grad_norm": 0.09539683908224106, "kl": 0.0, "learning_rate": 2.3464458247066942e-08, "loss": 0.0, "num_tokens": 83027435.0, "reward": 0.7666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2813 }, { "completion_length": 786.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1257.0, "completions/max_terminated_length": 1257.0, "completions/mean_length": 786.0, "completions/mean_terminated_length": 786.0, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 0.9545454545454546, "frac_reward_zero_std": 0.0, "grad_norm": 0.10480193048715591, "kl": 0.0, "learning_rate": 2.3291925465838508e-08, "loss": 0.0004, "num_tokens": 83053799.0, "reward": 1.2000000476837158, "reward_std": 0.08164963126182556, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2814 }, { "completion_length": 1222.9167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2312.0, "completions/max_terminated_length": 2312.0, "completions/mean_length": 1222.916748046875, "completions/mean_terminated_length": 1222.916748046875, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 0.9548846675712347, "frac_reward_zero_std": 0.5, "grad_norm": 0.26129332184791565, "kl": 0.0, "learning_rate": 2.3119392684610074e-08, "loss": -0.0011, "num_tokens": 83081494.0, "reward": 1.1166667938232422, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.8166666030883789, "rewards/correctness_reward_func/std": 0.27579087018966675, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2815 }, { "completion_length": 2311.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4950.0, "completions/max_terminated_length": 4950.0, "completions/mean_length": 2311.33349609375, "completions/mean_terminated_length": 2311.33349609375, "completions/min_length": 1182.0, "completions/min_terminated_length": 1182.0, "epoch": 0.9552238805970149, "frac_reward_zero_std": 0.0, "grad_norm": 0.576535701751709, "kl": 0.0, "learning_rate": 2.2946859903381644e-08, "loss": -0.0111, "num_tokens": 83121668.0, "reward": 0.8666666746139526, "reward_std": 0.24494892358779907, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2816 }, { "completion_length": 538.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 538.3333740234375, "completions/mean_terminated_length": 538.3333740234375, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.9555630936227951, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.2774327122153206e-08, "loss": 0.0, "num_tokens": 83139984.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2817 }, { "completion_length": 2058.7500610351562, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6367.0, "completions/mean_length": 3156.916748046875, "completions/mean_terminated_length": 2470.5, "completions/min_length": 680.0, "completions/min_terminated_length": 680.0, "epoch": 0.9559023066485753, "frac_reward_zero_std": 0.5, "grad_norm": 0.19753044843673706, "kl": NaN, "learning_rate": 2.2601794340924776e-08, "loss": -0.0165, "num_tokens": 83177517.0, "reward": 0.6625000834465027, "reward_std": 0.06274950504302979, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.26250001788139343, "rewards/format_reward_func/std": 0.09323723614215851, "step": 2818 }, { "completion_length": 1208.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1737.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 1208.5833740234375, "completions/mean_terminated_length": 1208.5833740234375, "completions/min_length": 613.0, "completions/min_terminated_length": 613.0, "epoch": 0.9562415196743554, "frac_reward_zero_std": 0.5, "grad_norm": 0.08930449932813644, "kl": 0.0, "learning_rate": 2.2429261559696342e-08, "loss": -0.0001, "num_tokens": 83205178.0, "reward": 1.2333333492279053, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2819 }, { "completion_length": 1527.7500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 4048.0, "completions/max_terminated_length": 4048.0, "completions/mean_length": 1527.75, "completions/mean_terminated_length": 1527.75, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "epoch": 0.9565807327001357, "frac_reward_zero_std": 0.5, "grad_norm": 1.0994954109191895, "kl": 0.0, "learning_rate": 2.2256728778467908e-08, "loss": -0.0193, "num_tokens": 83238109.0, "reward": 0.9166667461395264, "reward_std": 0.24013885855674744, "rewards/correctness_reward_func/mean": 0.6166666746139526, "rewards/correctness_reward_func/std": 0.37618499994277954, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2820 }, { "completion_length": 1096.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3092.0, "completions/max_terminated_length": 3092.0, "completions/mean_length": 1096.25, "completions/mean_terminated_length": 1096.25, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.9569199457259159, "frac_reward_zero_std": 1.0, "grad_norm": 1.674805218954134e-07, "kl": 0.0, "learning_rate": 2.2084195997239474e-08, "loss": 0.0, "num_tokens": 83265598.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2821 }, { "completion_length": 701.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1669.0, "completions/max_terminated_length": 1669.0, "completions/mean_length": 701.5833740234375, "completions/mean_terminated_length": 701.5833740234375, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.9572591587516961, "frac_reward_zero_std": 0.5, "grad_norm": 0.09564792364835739, "kl": 0.0, "learning_rate": 2.1911663216011043e-08, "loss": 0.0013, "num_tokens": 83283953.0, "reward": 1.2333333492279053, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2822 }, { "completion_length": 1029.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2426.0, "completions/max_terminated_length": 2426.0, "completions/mean_length": 1029.916748046875, "completions/mean_terminated_length": 1029.916748046875, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.9575983717774763, "frac_reward_zero_std": 0.5, "grad_norm": 0.08715655654668808, "kl": 0.0, "learning_rate": 2.1739130434782606e-08, "loss": 0.0006, "num_tokens": 83308414.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2823 }, { "completion_length": 4155.08349609375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5889.0, "completions/mean_length": 4704.1669921875, "completions/mean_terminated_length": 4532.818359375, "completions/min_length": 2089.0, "completions/min_terminated_length": 2089.0, "epoch": 0.9579375848032564, "frac_reward_zero_std": 0.0, "grad_norm": 0.6586392521858215, "kl": NaN, "learning_rate": 2.1566597653554176e-08, "loss": -0.0371, "num_tokens": 83369033.0, "reward": 0.8416666984558105, "reward_std": 0.26536136865615845, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 2824 }, { "completion_length": 1391.166748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3403.0, "completions/max_terminated_length": 3403.0, "completions/mean_length": 1391.166748046875, "completions/mean_terminated_length": 1391.166748046875, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "epoch": 0.9582767978290366, "frac_reward_zero_std": 1.0, "grad_norm": 3.8722285466974427e-07, "kl": 0.0, "learning_rate": 2.1394064872325738e-08, "loss": 0.0, "num_tokens": 83396077.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2825 }, { "completion_length": 894.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1238.0, "completions/max_terminated_length": 1238.0, "completions/mean_length": 894.0, "completions/mean_terminated_length": 894.0, "completions/min_length": 583.0, "completions/min_terminated_length": 583.0, "epoch": 0.9586160108548168, "frac_reward_zero_std": 0.5, "grad_norm": 0.07710827887058258, "kl": 0.0, "learning_rate": 2.1221532091097308e-08, "loss": 0.0, "num_tokens": 83416699.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2826 }, { "completion_length": 836.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1519.0, "completions/max_terminated_length": 1519.0, "completions/mean_length": 836.0833740234375, "completions/mean_terminated_length": 836.0833740234375, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.9589552238805971, "frac_reward_zero_std": 0.5, "grad_norm": 0.07595382630825043, "kl": 0.0, "learning_rate": 2.1048999309868874e-08, "loss": -0.0008, "num_tokens": 83439278.0, "reward": 1.1500000953674316, "reward_std": 0.05477222427725792, "rewards/correctness_reward_func/mean": 0.8500000834465027, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2827 }, { "completion_length": 939.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1501.0, "completions/max_terminated_length": 1501.0, "completions/mean_length": 939.6666870117188, "completions/mean_terminated_length": 939.6666870117188, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 0.9592944369063772, "frac_reward_zero_std": 0.5, "grad_norm": 0.08681418746709824, "kl": 0.0, "learning_rate": 2.087646652864044e-08, "loss": 0.0004, "num_tokens": 83460754.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2828 }, { "completion_length": 1020.9167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 1862.0, "completions/max_terminated_length": 1862.0, "completions/mean_length": 1020.9166870117188, "completions/mean_terminated_length": 1020.9166870117188, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.9596336499321574, "frac_reward_zero_std": 0.5, "grad_norm": 0.10518840700387955, "kl": 0.0, "learning_rate": 2.0703933747412006e-08, "loss": -0.0026, "num_tokens": 83481087.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2829 }, { "completion_length": 1048.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2946.0, "completions/max_terminated_length": 2946.0, "completions/mean_length": 1048.416748046875, "completions/mean_terminated_length": 1048.416748046875, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.9599728629579376, "frac_reward_zero_std": 0.5, "grad_norm": 0.07589062303304672, "kl": 0.0, "learning_rate": 2.0531400966183575e-08, "loss": 0.0004, "num_tokens": 83505278.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2830 }, { "completion_length": 1810.9166870117188, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 5921.0, "completions/mean_length": 2360.0, "completions/mean_terminated_length": 1975.5455322265625, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.9603120759837178, "frac_reward_zero_std": 0.5, "grad_norm": 0.14964275062084198, "kl": NaN, "learning_rate": 2.0358868184955138e-08, "loss": -0.0124, "num_tokens": 83538769.0, "reward": 0.7749999761581421, "reward_std": 0.06123724579811096, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 2831 }, { "completion_length": 2474.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4023.0, "completions/max_terminated_length": 4023.0, "completions/mean_length": 2474.33349609375, "completions/mean_terminated_length": 2474.33349609375, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "epoch": 0.9606512890094979, "frac_reward_zero_std": 0.0, "grad_norm": 0.8354182243347168, "kl": 0.0, "learning_rate": 2.0186335403726707e-08, "loss": 0.023, "num_tokens": 83578541.0, "reward": 0.7666666507720947, "reward_std": 0.4647580087184906, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2832 }, { "completion_length": 697.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1270.0, "completions/max_terminated_length": 1270.0, "completions/mean_length": 697.0, "completions/mean_terminated_length": 697.0, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.9609905020352781, "frac_reward_zero_std": 0.5, "grad_norm": 0.0554022453725338, "kl": 0.0, "learning_rate": 2.0013802622498273e-08, "loss": -0.0001, "num_tokens": 83598545.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2833 }, { "completion_length": 1252.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2438.0, "completions/max_terminated_length": 2438.0, "completions/mean_length": 1252.916748046875, "completions/mean_terminated_length": 1252.916748046875, "completions/min_length": 656.0, "completions/min_terminated_length": 656.0, "epoch": 0.9613297150610584, "frac_reward_zero_std": 1.0, "grad_norm": 2.276112525123608e-07, "kl": 0.0, "learning_rate": 1.984126984126984e-08, "loss": 0.0, "num_tokens": 83623912.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2834 }, { "completion_length": 886.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1639.0, "completions/max_terminated_length": 1639.0, "completions/mean_length": 886.5, "completions/mean_terminated_length": 886.5, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 0.9616689280868386, "frac_reward_zero_std": 0.5, "grad_norm": 0.07829717546701431, "kl": 0.0, "learning_rate": 1.9668737060041406e-08, "loss": -0.0001, "num_tokens": 83645830.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2835 }, { "completion_length": 1194.9167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2688.0, "completions/max_terminated_length": 2688.0, "completions/mean_length": 1194.916748046875, "completions/mean_terminated_length": 1194.916748046875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.9620081411126187, "frac_reward_zero_std": 0.5, "grad_norm": 0.4215218126773834, "kl": 0.0, "learning_rate": 1.9496204278812975e-08, "loss": -0.0028, "num_tokens": 83676369.0, "reward": 1.133333444595337, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.2806917726993561, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2836 }, { "completion_length": 1498.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3193.0, "completions/max_terminated_length": 3193.0, "completions/mean_length": 1498.5, "completions/mean_terminated_length": 1498.5, "completions/min_length": 563.0, "completions/min_terminated_length": 563.0, "epoch": 0.9623473541383989, "frac_reward_zero_std": 0.0, "grad_norm": 0.18940430879592896, "kl": 0.0, "learning_rate": 1.9323671497584538e-08, "loss": -0.0072, "num_tokens": 83709393.0, "reward": 1.2166666984558105, "reward_std": 0.10641199350357056, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2837 }, { "completion_length": 969.4167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 1584.0, "completions/max_terminated_length": 1584.0, "completions/mean_length": 969.4166870117188, "completions/mean_terminated_length": 969.4166870117188, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 0.9626865671641791, "frac_reward_zero_std": 0.5, "grad_norm": 0.09722740203142166, "kl": 0.0, "learning_rate": 1.9151138716356107e-08, "loss": 0.0012, "num_tokens": 83734826.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2838 }, { "completion_length": 889.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1776.0, "completions/max_terminated_length": 1776.0, "completions/mean_length": 889.4166870117188, "completions/mean_terminated_length": 889.4166870117188, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.9630257801899593, "frac_reward_zero_std": 1.0, "grad_norm": 1.54896511617153e-07, "kl": 0.0, "learning_rate": 1.8978605935127676e-08, "loss": 0.0, "num_tokens": 83755801.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2839 }, { "completion_length": 810.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1168.0, "completions/max_terminated_length": 1168.0, "completions/mean_length": 810.5833740234375, "completions/mean_terminated_length": 810.5833740234375, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "epoch": 0.9633649932157394, "frac_reward_zero_std": 0.5, "grad_norm": 0.08907705545425415, "kl": 0.0, "learning_rate": 1.880607315389924e-08, "loss": -0.0003, "num_tokens": 83778290.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2840 }, { "completion_length": 1059.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2727.0, "completions/max_terminated_length": 2727.0, "completions/mean_length": 1059.0, "completions/mean_terminated_length": 1059.0, "completions/min_length": 580.0, "completions/min_terminated_length": 580.0, "epoch": 0.9637042062415196, "frac_reward_zero_std": 0.0, "grad_norm": 0.41762709617614746, "kl": 0.0, "learning_rate": 1.863354037267081e-08, "loss": 0.002, "num_tokens": 83801150.0, "reward": 0.949999988079071, "reward_std": 0.29902368783950806, "rewards/correctness_reward_func/mean": 0.6500000357627869, "rewards/correctness_reward_func/std": 0.4833594560623169, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2841 }, { "completion_length": 1105.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2961.0, "completions/max_terminated_length": 2961.0, "completions/mean_length": 1105.25, "completions/mean_terminated_length": 1105.25, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.9640434192672999, "frac_reward_zero_std": 0.0, "grad_norm": 0.1140049546957016, "kl": 0.0, "learning_rate": 1.8461007591442375e-08, "loss": -0.0006, "num_tokens": 83827499.0, "reward": 1.25, "reward_std": 0.09246455878019333, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2842 }, { "completion_length": 929.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1197.0, "completions/max_terminated_length": 1197.0, "completions/mean_length": 929.0, "completions/mean_terminated_length": 929.0, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.9643826322930801, "frac_reward_zero_std": 0.5, "grad_norm": 0.0990026518702507, "kl": 0.0, "learning_rate": 1.828847481021394e-08, "loss": 0.0002, "num_tokens": 83853521.0, "reward": 0.75, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.44999995827674866, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2843 }, { "completion_length": 855.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1233.0, "completions/max_terminated_length": 1233.0, "completions/mean_length": 855.75, "completions/mean_terminated_length": 855.75, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.9647218453188603, "frac_reward_zero_std": 0.5, "grad_norm": 0.10271433740854263, "kl": 0.0, "learning_rate": 1.8115942028985507e-08, "loss": -0.0001, "num_tokens": 83876216.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2844 }, { "completion_length": 1787.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2973.0, "completions/max_terminated_length": 2973.0, "completions/mean_length": 1787.5833740234375, "completions/mean_terminated_length": 1787.5833740234375, "completions/min_length": 711.0, "completions/min_terminated_length": 711.0, "epoch": 0.9650610583446404, "frac_reward_zero_std": 0.5, "grad_norm": 0.6743563413619995, "kl": 0.0, "learning_rate": 1.7943409247757073e-08, "loss": 0.0014, "num_tokens": 83912631.0, "reward": 0.9500000476837158, "reward_std": 0.27386125922203064, "rewards/correctness_reward_func/mean": 0.6499999761581421, "rewards/correctness_reward_func/std": 0.40113475918769836, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2845 }, { "completion_length": 1942.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3760.0, "completions/max_terminated_length": 3760.0, "completions/mean_length": 1942.8333740234375, "completions/mean_terminated_length": 1942.8333740234375, "completions/min_length": 1107.0, "completions/min_terminated_length": 1107.0, "epoch": 0.9654002713704206, "frac_reward_zero_std": 1.0, "grad_norm": 1.5326969560192083e-07, "kl": 0.0, "learning_rate": 1.777087646652864e-08, "loss": 0.0, "num_tokens": 83951005.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2846 }, { "completion_length": 611.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 611.75, "completions/mean_terminated_length": 611.75, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 0.9657394843962008, "frac_reward_zero_std": 0.0, "grad_norm": 0.10781288146972656, "kl": 0.0, "learning_rate": 1.7598343685300208e-08, "loss": -0.0011, "num_tokens": 83969110.0, "reward": 1.25, "reward_std": 0.09246455132961273, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2847 }, { "completion_length": 2369.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5753.0, "completions/max_terminated_length": 5753.0, "completions/mean_length": 2369.0, "completions/mean_terminated_length": 2369.0, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.966078697421981, "frac_reward_zero_std": 0.5, "grad_norm": 0.698265552520752, "kl": 0.0, "learning_rate": 1.742581090407177e-08, "loss": 0.0097, "num_tokens": 84009808.0, "reward": 1.066666841506958, "reward_std": 0.18618986010551453, "rewards/correctness_reward_func/mean": 0.7666667103767395, "rewards/correctness_reward_func/std": 0.25346091389656067, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2848 }, { "completion_length": 1077.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2406.0, "completions/max_terminated_length": 2406.0, "completions/mean_length": 1077.0, "completions/mean_terminated_length": 1077.0, "completions/min_length": 557.0, "completions/min_terminated_length": 557.0, "epoch": 0.9664179104477612, "frac_reward_zero_std": 0.5, "grad_norm": 0.06727226823568344, "kl": 0.0, "learning_rate": 1.725327812284334e-08, "loss": -0.0007, "num_tokens": 84036406.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2849 }, { "completion_length": 815.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2268.0, "completions/max_terminated_length": 2268.0, "completions/mean_length": 815.8333740234375, "completions/mean_terminated_length": 815.8333740234375, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.9667571234735414, "frac_reward_zero_std": 0.5, "grad_norm": 0.5351754426956177, "kl": 0.0, "learning_rate": 1.7080745341614906e-08, "loss": 0.0168, "num_tokens": 84060800.0, "reward": 1.2166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.28867512941360474, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2850 }, { "completion_length": 1536.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 3104.0, "completions/max_terminated_length": 3104.0, "completions/mean_length": 1536.3333740234375, "completions/mean_terminated_length": 1536.3333740234375, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 0.9670963364993216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.6908212560386473e-08, "loss": 0.0, "num_tokens": 84093498.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2851 }, { "completion_length": 658.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1275.0, "completions/max_terminated_length": 1275.0, "completions/mean_length": 658.25, "completions/mean_terminated_length": 658.25, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.9674355495251018, "frac_reward_zero_std": 1.0, "grad_norm": 2.5428948902117554e-07, "kl": 0.0, "learning_rate": 1.673567977915804e-08, "loss": 0.0, "num_tokens": 84111483.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2852 }, { "completion_length": 1751.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2477.0, "completions/max_terminated_length": 2477.0, "completions/mean_length": 1751.416748046875, "completions/mean_terminated_length": 1751.416748046875, "completions/min_length": 850.0, "completions/min_terminated_length": 850.0, "epoch": 0.9677747625508819, "frac_reward_zero_std": 0.5, "grad_norm": 0.10446907579898834, "kl": 0.0, "learning_rate": 1.6563146997929608e-08, "loss": -0.0029, "num_tokens": 84146144.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2853 }, { "completion_length": 788.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1241.0, "completions/max_terminated_length": 1241.0, "completions/mean_length": 788.0, "completions/mean_terminated_length": 788.0, "completions/min_length": 533.0, "completions/min_terminated_length": 533.0, "epoch": 0.9681139755766621, "frac_reward_zero_std": 0.5, "grad_norm": 0.059606585651636124, "kl": 0.0, "learning_rate": 1.639061421670117e-08, "loss": 0.0001, "num_tokens": 84165980.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2854 }, { "completion_length": 865.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1977.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 865.9166870117188, "completions/mean_terminated_length": 865.9166870117188, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.9684531886024423, "frac_reward_zero_std": 0.5, "grad_norm": 0.07165784388780594, "kl": 0.0, "learning_rate": 1.621808143547274e-08, "loss": -0.0003, "num_tokens": 84190693.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2855 }, { "completion_length": 941.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1587.0, "completions/max_terminated_length": 1587.0, "completions/mean_length": 941.8333740234375, "completions/mean_terminated_length": 941.8333740234375, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 0.9687924016282226, "frac_reward_zero_std": 0.5, "grad_norm": 0.3992924094200134, "kl": 0.0, "learning_rate": 1.6045548654244306e-08, "loss": 0.0017, "num_tokens": 84212435.0, "reward": 1.066666603088379, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.7666666507720947, "rewards/correctness_reward_func/std": 0.3700941801071167, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2856 }, { "completion_length": 1301.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2923.0, "completions/max_terminated_length": 2923.0, "completions/mean_length": 1301.0833740234375, "completions/mean_terminated_length": 1301.0833740234375, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "epoch": 0.9691316146540027, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.5873015873015872e-08, "loss": 0.0, "num_tokens": 84235536.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2857 }, { "completion_length": 758.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1037.0, "completions/max_terminated_length": 1037.0, "completions/mean_length": 758.3333740234375, "completions/mean_terminated_length": 758.3333740234375, "completions/min_length": 527.0, "completions/min_terminated_length": 527.0, "epoch": 0.9694708276797829, "frac_reward_zero_std": 1.0, "grad_norm": 1.0521717541678299e-07, "kl": 0.0, "learning_rate": 1.5700483091787438e-08, "loss": 0.0, "num_tokens": 84257890.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2858 }, { "completion_length": 2547.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4334.0, "completions/max_terminated_length": 4334.0, "completions/mean_length": 2547.416748046875, "completions/mean_terminated_length": 2547.416748046875, "completions/min_length": 953.0, "completions/min_terminated_length": 953.0, "epoch": 0.9698100407055631, "frac_reward_zero_std": 0.0, "grad_norm": 1.5106278657913208, "kl": 0.0, "learning_rate": 1.5527950310559004e-08, "loss": 0.0085, "num_tokens": 84295359.0, "reward": 0.8000000715255737, "reward_std": 0.42149817943573, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.4472135901451111, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2859 }, { "completion_length": 721.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1494.0, "completions/max_terminated_length": 1494.0, "completions/mean_length": 721.3333740234375, "completions/mean_terminated_length": 721.3333740234375, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.9701492537313433, "frac_reward_zero_std": 0.5, "grad_norm": 0.10399395227432251, "kl": 0.0, "learning_rate": 1.535541752933057e-08, "loss": -0.0011, "num_tokens": 84318487.0, "reward": 1.2666666507720947, "reward_std": 0.051639750599861145, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2860 }, { "completion_length": 2452.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 6132.0, "completions/max_terminated_length": 6132.0, "completions/mean_length": 2452.916748046875, "completions/mean_terminated_length": 2452.916748046875, "completions/min_length": 973.0, "completions/min_terminated_length": 973.0, "epoch": 0.9704884667571235, "frac_reward_zero_std": 0.5, "grad_norm": 0.5409563779830933, "kl": 0.0, "learning_rate": 1.518288474810214e-08, "loss": 0.0185, "num_tokens": 84360384.0, "reward": 0.833333432674408, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.5333333611488342, "rewards/correctness_reward_func/std": 0.3938928246498108, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2861 }, { "completion_length": 974.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1462.0, "completions/max_terminated_length": 1462.0, "completions/mean_length": 974.5, "completions/mean_terminated_length": 974.5, "completions/min_length": 602.0, "completions/min_terminated_length": 602.0, "epoch": 0.9708276797829036, "frac_reward_zero_std": 0.0, "grad_norm": 0.4106888473033905, "kl": 0.0, "learning_rate": 1.5010351966873706e-08, "loss": -0.0056, "num_tokens": 84383856.0, "reward": 0.8666667342185974, "reward_std": 0.2753456234931946, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.42497774958610535, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2862 }, { "completion_length": 888.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1347.0, "completions/max_terminated_length": 1347.0, "completions/mean_length": 888.1666870117188, "completions/mean_terminated_length": 888.1666870117188, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.9711668928086838, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.4837819185645272e-08, "loss": 0.0, "num_tokens": 84411236.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2863 }, { "completion_length": 2629.5001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5298.0, "completions/max_terminated_length": 5298.0, "completions/mean_length": 2629.5, "completions/mean_terminated_length": 2629.5, "completions/min_length": 676.0, "completions/min_terminated_length": 676.0, "epoch": 0.9715061058344641, "frac_reward_zero_std": 0.5, "grad_norm": 0.4353359043598175, "kl": 0.0, "learning_rate": 1.466528640441684e-08, "loss": 0.0124, "num_tokens": 84455954.0, "reward": 0.7666667699813843, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.46666669845581055, "rewards/correctness_reward_func/std": 0.4119429290294647, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2864 }, { "completion_length": 1309.2500610351562, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 2608.0, "completions/mean_length": 1858.3333740234375, "completions/mean_terminated_length": 1428.2728271484375, "completions/min_length": 671.0, "completions/min_terminated_length": 671.0, "epoch": 0.9718453188602443, "frac_reward_zero_std": 0.0, "grad_norm": 0.4283584654331207, "kl": NaN, "learning_rate": 1.4492753623188406e-08, "loss": -0.0108, "num_tokens": 84483533.0, "reward": 0.9416667819023132, "reward_std": 0.38783591985702515, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660254627466202, "step": 2865 }, { "completion_length": 691.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1365.0, "completions/max_terminated_length": 1365.0, "completions/mean_length": 691.3333740234375, "completions/mean_terminated_length": 691.3333740234375, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.9721845318860244, "frac_reward_zero_std": 0.5, "grad_norm": 0.04970015957951546, "kl": 0.0, "learning_rate": 1.4320220841959972e-08, "loss": -0.0001, "num_tokens": 84502251.0, "reward": 1.25, "reward_std": 0.05477223917841911, "rewards/correctness_reward_func/mean": 0.949999988079071, "rewards/correctness_reward_func/std": 0.09045339375734329, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2866 }, { "completion_length": 871.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1064.0, "completions/max_terminated_length": 1064.0, "completions/mean_length": 871.9166870117188, "completions/mean_terminated_length": 871.9166870117188, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "epoch": 0.9725237449118046, "frac_reward_zero_std": 1.0, "grad_norm": 2.5207478415723017e-07, "kl": 0.0, "learning_rate": 1.4147688060731538e-08, "loss": 0.0, "num_tokens": 84524720.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2867 }, { "completion_length": 2090.416748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4324.0, "completions/max_terminated_length": 4324.0, "completions/mean_length": 2090.416748046875, "completions/mean_terminated_length": 2090.416748046875, "completions/min_length": 761.0, "completions/min_terminated_length": 761.0, "epoch": 0.9728629579375848, "frac_reward_zero_std": 1.0, "grad_norm": 1.177566062438018e-07, "kl": 0.0, "learning_rate": 1.3975155279503106e-08, "loss": 0.0, "num_tokens": 84562729.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2868 }, { "completion_length": 1000.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2201.0, "completions/max_terminated_length": 2201.0, "completions/mean_length": 1000.6666870117188, "completions/mean_terminated_length": 1000.6666870117188, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.973202170963365, "frac_reward_zero_std": 0.5, "grad_norm": 0.07785006612539291, "kl": 0.0, "learning_rate": 1.3802622498274672e-08, "loss": 0.0003, "num_tokens": 84584949.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2869 }, { "completion_length": 605.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1170.0, "completions/max_terminated_length": 1170.0, "completions/mean_length": 605.1666870117188, "completions/mean_terminated_length": 605.1666870117188, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.9735413839891451, "frac_reward_zero_std": 1.0, "grad_norm": 1.4621221566812892e-07, "kl": 0.0, "learning_rate": 1.3630089717046238e-08, "loss": 0.0, "num_tokens": 84608285.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2870 }, { "completion_length": 1453.8333740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3302.0, "completions/mean_length": 2002.916748046875, "completions/mean_terminated_length": 1586.0, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "epoch": 0.9738805970149254, "frac_reward_zero_std": 0.5, "grad_norm": 0.48631492257118225, "kl": NaN, "learning_rate": 1.3457556935817805e-08, "loss": -0.032, "num_tokens": 84636675.0, "reward": 1.008333444595337, "reward_std": 0.22453658282756805, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 2871 }, { "completion_length": 864.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1423.0, "completions/max_terminated_length": 1423.0, "completions/mean_length": 864.75, "completions/mean_terminated_length": 864.75, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 0.9742198100407056, "frac_reward_zero_std": 1.0, "grad_norm": 1.6320369411459978e-07, "kl": 0.0, "learning_rate": 1.3285024154589372e-08, "loss": 0.0, "num_tokens": 84658308.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2872 }, { "completion_length": 1737.5000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 2315.0, "completions/max_terminated_length": 2315.0, "completions/mean_length": 1737.5, "completions/mean_terminated_length": 1737.5, "completions/min_length": 895.0, "completions/min_terminated_length": 895.0, "epoch": 0.9745590230664858, "frac_reward_zero_std": 1.0, "grad_norm": 1.0832415142658647e-07, "kl": 0.0, "learning_rate": 1.3112491373360938e-08, "loss": 0.0, "num_tokens": 84691350.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2873 }, { "completion_length": 1182.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3076.0, "completions/max_terminated_length": 3076.0, "completions/mean_length": 1182.916748046875, "completions/mean_terminated_length": 1182.916748046875, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.9748982360922659, "frac_reward_zero_std": 0.5, "grad_norm": 0.046899519860744476, "kl": 0.0, "learning_rate": 1.2939958592132505e-08, "loss": -0.0006, "num_tokens": 84721049.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2874 }, { "completion_length": 1161.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 4001.0, "completions/max_terminated_length": 4001.0, "completions/mean_length": 1161.75, "completions/mean_terminated_length": 1161.75, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 0.9752374491180461, "frac_reward_zero_std": 0.5, "grad_norm": 0.11237528175115585, "kl": 0.0, "learning_rate": 1.2767425810904071e-08, "loss": -0.0059, "num_tokens": 84747944.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2875 }, { "completion_length": 888.0833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1142.0, "completions/max_terminated_length": 1142.0, "completions/mean_length": 888.0833740234375, "completions/mean_terminated_length": 888.0833740234375, "completions/min_length": 520.0, "completions/min_terminated_length": 520.0, "epoch": 0.9755766621438263, "frac_reward_zero_std": 0.5, "grad_norm": 0.08207489550113678, "kl": 0.0, "learning_rate": 1.2594893029675637e-08, "loss": 0.0004, "num_tokens": 84772029.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2876 }, { "completion_length": 1120.4167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2415.0, "completions/max_terminated_length": 2415.0, "completions/mean_length": 1120.416748046875, "completions/mean_terminated_length": 1120.416748046875, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "epoch": 0.9759158751696065, "frac_reward_zero_std": 0.0, "grad_norm": 0.3747636675834656, "kl": 0.0, "learning_rate": 1.2422360248447204e-08, "loss": 0.0025, "num_tokens": 84796418.0, "reward": 0.9166666865348816, "reward_std": 0.24738392233848572, "rewards/correctness_reward_func/mean": 0.6166666746139526, "rewards/correctness_reward_func/std": 0.4628632962703705, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2877 }, { "completion_length": 711.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1104.0, "completions/max_terminated_length": 1104.0, "completions/mean_length": 711.3333740234375, "completions/mean_terminated_length": 711.3333740234375, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 0.9762550881953868, "frac_reward_zero_std": 1.0, "grad_norm": 1.8021535197476624e-07, "kl": 0.0, "learning_rate": 1.2249827467218771e-08, "loss": 0.0, "num_tokens": 84817194.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2878 }, { "completion_length": 733.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1506.0, "completions/max_terminated_length": 1506.0, "completions/mean_length": 733.0, "completions/mean_terminated_length": 733.0, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 0.9765943012211669, "frac_reward_zero_std": 0.5, "grad_norm": 0.4308410882949829, "kl": 0.0, "learning_rate": 1.2077294685990337e-08, "loss": 0.0077, "num_tokens": 84839790.0, "reward": 0.9666666388511658, "reward_std": 0.2581988573074341, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.4923659861087799, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2879 }, { "completion_length": 1280.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3200.0, "completions/max_terminated_length": 3200.0, "completions/mean_length": 1280.166748046875, "completions/mean_terminated_length": 1280.166748046875, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.9769335142469471, "frac_reward_zero_std": 0.0, "grad_norm": 0.38732224702835083, "kl": 0.0, "learning_rate": 1.1904761904761903e-08, "loss": -0.0021, "num_tokens": 84866138.0, "reward": 1.1666667461395264, "reward_std": 0.2588964104652405, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.287096232175827, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2880 }, { "completion_length": 2231.7501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 5999.0, "completions/max_terminated_length": 5999.0, "completions/mean_length": 2231.75, "completions/mean_terminated_length": 2231.75, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.9772727272727273, "frac_reward_zero_std": 0.5, "grad_norm": 0.5650548934936523, "kl": 0.0, "learning_rate": 1.1732229123533471e-08, "loss": -0.0304, "num_tokens": 84909665.0, "reward": 0.9666668176651001, "reward_std": 0.20655910670757294, "rewards/correctness_reward_func/mean": 0.6666666865348816, "rewards/correctness_reward_func/std": 0.31139957904815674, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2881 }, { "completion_length": 1412.2500610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3181.0, "completions/max_terminated_length": 3181.0, "completions/mean_length": 1412.25, "completions/mean_terminated_length": 1412.25, "completions/min_length": 573.0, "completions/min_terminated_length": 573.0, "epoch": 0.9776119402985075, "frac_reward_zero_std": 0.5, "grad_norm": 0.31896987557411194, "kl": 0.0, "learning_rate": 1.1559696342305037e-08, "loss": -0.0076, "num_tokens": 84935012.0, "reward": 0.7166666984558105, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.5149286985397339, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2882 }, { "completion_length": 1391.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3327.0, "completions/max_terminated_length": 3327.0, "completions/mean_length": 1391.416748046875, "completions/mean_terminated_length": 1391.416748046875, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 0.9779511533242876, "frac_reward_zero_std": 0.0, "grad_norm": 0.11799215525388718, "kl": 0.0, "learning_rate": 1.1387163561076603e-08, "loss": 0.0016, "num_tokens": 84958573.0, "reward": 0.6916667819023132, "reward_std": 0.07955466210842133, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.05838742107152939, "step": 2883 }, { "completion_length": 1010.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1657.0, "completions/max_terminated_length": 1657.0, "completions/mean_length": 1010.8333740234375, "completions/mean_terminated_length": 1010.8333740234375, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.9782903663500678, "frac_reward_zero_std": 0.0, "grad_norm": 0.3060120940208435, "kl": 0.0, "learning_rate": 1.1214630779848171e-08, "loss": 0.0035, "num_tokens": 84986339.0, "reward": 1.0500000715255737, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.24308621883392334, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2884 }, { "completion_length": 2128.58349609375, "completions/clipped_ratio": 0.0, "completions/max_length": 4170.0, "completions/max_terminated_length": 4170.0, "completions/mean_length": 2128.58349609375, "completions/mean_terminated_length": 2128.58349609375, "completions/min_length": 1128.0, "completions/min_terminated_length": 1128.0, "epoch": 0.978629579375848, "frac_reward_zero_std": 0.0, "grad_norm": 0.788396954536438, "kl": 0.0, "learning_rate": 1.1042097998619737e-08, "loss": 0.0329, "num_tokens": 85023456.0, "reward": 0.8000000715255737, "reward_std": 0.42149817943573, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.4472135901451111, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2885 }, { "completion_length": 576.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 781.0, "completions/max_terminated_length": 781.0, "completions/mean_length": 576.3333740234375, "completions/mean_terminated_length": 576.3333740234375, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.9789687924016283, "frac_reward_zero_std": 1.0, "grad_norm": 1.6930594881614525e-07, "kl": 0.0, "learning_rate": 1.0869565217391303e-08, "loss": 0.0, "num_tokens": 85039954.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2886 }, { "completion_length": 1010.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2013.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 1010.8333740234375, "completions/mean_terminated_length": 1010.8333740234375, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 0.9793080054274084, "frac_reward_zero_std": 1.0, "grad_norm": 1.949144063928543e-07, "kl": 0.0, "learning_rate": 1.0697032436162869e-08, "loss": 0.0, "num_tokens": 85063778.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2887 }, { "completion_length": 1264.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 4206.0, "completions/max_terminated_length": 4206.0, "completions/mean_length": 1264.8333740234375, "completions/mean_terminated_length": 1264.8333740234375, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.9796472184531886, "frac_reward_zero_std": 0.5, "grad_norm": 0.0916147455573082, "kl": 0.0, "learning_rate": 1.0524499654934437e-08, "loss": 0.0013, "num_tokens": 85093080.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2888 }, { "completion_length": 1177.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2434.0, "completions/max_terminated_length": 2434.0, "completions/mean_length": 1177.3333740234375, "completions/mean_terminated_length": 1177.3333740234375, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 0.9799864314789688, "frac_reward_zero_std": 0.5, "grad_norm": 0.24063491821289062, "kl": 0.0, "learning_rate": 1.0351966873706003e-08, "loss": 0.0009, "num_tokens": 85121812.0, "reward": 0.8666666746139526, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.5666666626930237, "rewards/correctness_reward_func/std": 0.5033223032951355, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2889 }, { "completion_length": 1013.9167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2182.0, "completions/max_terminated_length": 2182.0, "completions/mean_length": 1013.9166870117188, "completions/mean_terminated_length": 1013.9166870117188, "completions/min_length": 586.0, "completions/min_terminated_length": 586.0, "epoch": 0.980325644504749, "frac_reward_zero_std": 0.5, "grad_norm": 0.05702368542551994, "kl": 0.0, "learning_rate": 1.0179434092477569e-08, "loss": 0.0002, "num_tokens": 85143729.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2890 }, { "completion_length": 538.4166717529297, "completions/clipped_ratio": 0.0, "completions/max_length": 947.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 538.4166870117188, "completions/mean_terminated_length": 538.4166870117188, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.9806648575305291, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.0006901311249137e-08, "loss": 0.0, "num_tokens": 85162106.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2891 }, { "completion_length": 558.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1223.0, "completions/max_terminated_length": 1223.0, "completions/mean_length": 558.3333740234375, "completions/mean_terminated_length": 558.3333740234375, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.9810040705563093, "frac_reward_zero_std": 0.5, "grad_norm": 1.1026588678359985, "kl": 0.0, "learning_rate": 9.834368530020703e-09, "loss": 0.0009, "num_tokens": 85181898.0, "reward": 1.2333333492279053, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.9333333373069763, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2892 }, { "completion_length": 2599.75, "completions/clipped_ratio": 0.0, "completions/max_length": 6383.0, "completions/max_terminated_length": 6383.0, "completions/mean_length": 2599.75, "completions/mean_terminated_length": 2599.75, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.9813432835820896, "frac_reward_zero_std": 0.5, "grad_norm": 0.32802703976631165, "kl": 0.0, "learning_rate": 9.661835748792269e-09, "loss": -0.0101, "num_tokens": 85222809.0, "reward": 0.6499999761581421, "reward_std": 0.17606817185878754, "rewards/correctness_reward_func/mean": 0.3500000238418579, "rewards/correctness_reward_func/std": 0.4358898997306824, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2893 }, { "completion_length": 957.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2152.0, "completions/max_terminated_length": 2152.0, "completions/mean_length": 957.8333740234375, "completions/mean_terminated_length": 957.8333740234375, "completions/min_length": 652.0, "completions/min_terminated_length": 652.0, "epoch": 0.9816824966078698, "frac_reward_zero_std": 1.0, "grad_norm": 1.1128819465966444e-07, "kl": 0.0, "learning_rate": 9.489302967563838e-09, "loss": 0.0, "num_tokens": 85251529.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2894 }, { "completion_length": 1538.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2935.0, "completions/max_terminated_length": 2935.0, "completions/mean_length": 1538.0833740234375, "completions/mean_terminated_length": 1538.0833740234375, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 0.98202170963365, "frac_reward_zero_std": 0.5, "grad_norm": 0.26691967248916626, "kl": 0.0, "learning_rate": 9.316770186335404e-09, "loss": 0.0139, "num_tokens": 85280840.0, "reward": 0.36666667461395264, "reward_std": 0.1632993221282959, "rewards/correctness_reward_func/mean": 0.06666667014360428, "rewards/correctness_reward_func/std": 0.2309401035308838, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2895 }, { "completion_length": 2283.8333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 5626.0, "completions/max_terminated_length": 5626.0, "completions/mean_length": 2283.83349609375, "completions/mean_terminated_length": 2283.83349609375, "completions/min_length": 866.0, "completions/min_terminated_length": 866.0, "epoch": 0.9823609226594301, "frac_reward_zero_std": 0.5, "grad_norm": 0.5437328219413757, "kl": 0.0, "learning_rate": 9.14423740510697e-09, "loss": 0.0292, "num_tokens": 85317810.0, "reward": 1.1666667461395264, "reward_std": 0.18618987500667572, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.287096232175827, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2896 }, { "completion_length": 1426.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2752.0, "completions/max_terminated_length": 2752.0, "completions/mean_length": 1426.5833740234375, "completions/mean_terminated_length": 1426.5833740234375, "completions/min_length": 850.0, "completions/min_terminated_length": 850.0, "epoch": 0.9827001356852103, "frac_reward_zero_std": 0.0, "grad_norm": 0.628836452960968, "kl": 0.0, "learning_rate": 8.971704623878536e-09, "loss": -0.0296, "num_tokens": 85348687.0, "reward": 1.0333333015441895, "reward_std": 0.3146860599517822, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.44585633277893066, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2897 }, { "completion_length": 825.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1380.0, "completions/max_terminated_length": 1380.0, "completions/mean_length": 825.6666870117188, "completions/mean_terminated_length": 825.6666870117188, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 0.9830393487109905, "frac_reward_zero_std": 1.0, "grad_norm": 1.8613648933296645e-07, "kl": 0.0, "learning_rate": 8.799171842650104e-09, "loss": 0.0, "num_tokens": 85371369.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2898 }, { "completion_length": 759.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 1179.0, "completions/max_terminated_length": 1179.0, "completions/mean_length": 759.3333740234375, "completions/mean_terminated_length": 759.3333740234375, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.9833785617367707, "frac_reward_zero_std": 0.5, "grad_norm": 0.07341646403074265, "kl": 0.0, "learning_rate": 8.62663906142167e-09, "loss": 0.0002, "num_tokens": 85391245.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2899 }, { "completion_length": 988.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1609.0, "completions/max_terminated_length": 1609.0, "completions/mean_length": 988.0, "completions/mean_terminated_length": 988.0, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 0.9837177747625508, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 8.454106280193236e-09, "loss": 0.0, "num_tokens": 85411759.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2900 }, { "completion_length": 975.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 3336.0, "completions/max_terminated_length": 3336.0, "completions/mean_length": 975.9166870117188, "completions/mean_terminated_length": 975.9166870117188, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 0.9840569877883311, "frac_reward_zero_std": 0.5, "grad_norm": 0.5029984712600708, "kl": 0.0, "learning_rate": 8.281573498964804e-09, "loss": 0.008, "num_tokens": 85436946.0, "reward": 0.7000000476837158, "reward_std": 0.20000001788139343, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4972652792930603, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2901 }, { "completion_length": 2329.0001220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6067.0, "completions/max_terminated_length": 6067.0, "completions/mean_length": 2329.0, "completions/mean_terminated_length": 2329.0, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 0.9843962008141113, "frac_reward_zero_std": 0.5, "grad_norm": 0.11870120465755463, "kl": 0.0, "learning_rate": 8.10904071773637e-09, "loss": 0.0021, "num_tokens": 85479750.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2902 }, { "completion_length": 592.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 741.0, "completions/max_terminated_length": 741.0, "completions/mean_length": 592.6666870117188, "completions/mean_terminated_length": 592.6666870117188, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 0.9847354138398915, "frac_reward_zero_std": 1.0, "grad_norm": 8.081712365992644e-08, "kl": 0.0, "learning_rate": 7.936507936507936e-09, "loss": 0.0, "num_tokens": 85500242.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2903 }, { "completion_length": 712.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 712.25, "completions/mean_terminated_length": 712.25, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "epoch": 0.9850746268656716, "frac_reward_zero_std": 0.5, "grad_norm": 0.06843524426221848, "kl": 0.0, "learning_rate": 7.763975155279502e-09, "loss": -0.0003, "num_tokens": 85518701.0, "reward": 1.1666667461395264, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.8666666150093079, "rewards/correctness_reward_func/std": 0.0984731912612915, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2904 }, { "completion_length": 596.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 596.5833740234375, "completions/mean_terminated_length": 596.5833740234375, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 0.9854138398914518, "frac_reward_zero_std": 0.5, "grad_norm": 0.052025116980075836, "kl": 0.0, "learning_rate": 7.59144237405107e-09, "loss": 0.0002, "num_tokens": 85539036.0, "reward": 1.133333444595337, "reward_std": 0.051639772951602936, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2905 }, { "completion_length": 2533.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 4821.0, "completions/max_terminated_length": 4821.0, "completions/mean_length": 2533.666748046875, "completions/mean_terminated_length": 2533.666748046875, "completions/min_length": 1135.0, "completions/min_terminated_length": 1135.0, "epoch": 0.985753052917232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 7.418909592822636e-09, "loss": 0.0, "num_tokens": 85578584.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2906 }, { "completion_length": 357.6666717529297, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 357.66668701171875, "completions/mean_terminated_length": 357.66668701171875, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.9860922659430122, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 7.246376811594203e-09, "loss": 0.0, "num_tokens": 85595644.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2907 }, { "completion_length": 1095.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1693.0, "completions/max_terminated_length": 1693.0, "completions/mean_length": 1095.0, "completions/mean_terminated_length": 1095.0, "completions/min_length": 555.0, "completions/min_terminated_length": 555.0, "epoch": 0.9864314789687924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 7.073844030365769e-09, "loss": 0.0, "num_tokens": 85621150.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2908 }, { "completion_length": 863.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1870.0, "completions/max_terminated_length": 1870.0, "completions/mean_length": 863.9166870117188, "completions/mean_terminated_length": 863.9166870117188, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 0.9867706919945726, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 6.901311249137336e-09, "loss": 0.0, "num_tokens": 85641063.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2909 }, { "completion_length": 904.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1570.0, "completions/max_terminated_length": 1570.0, "completions/mean_length": 904.75, "completions/mean_terminated_length": 904.75, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.9871099050203528, "frac_reward_zero_std": 0.5, "grad_norm": 0.05365384742617607, "kl": 0.0, "learning_rate": 6.728778467908903e-09, "loss": 0.0006, "num_tokens": 85662948.0, "reward": 1.1166667938232422, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.8166666626930237, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2910 }, { "completion_length": 823.3333740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1638.0, "completions/max_terminated_length": 1638.0, "completions/mean_length": 823.3333740234375, "completions/mean_terminated_length": 823.3333740234375, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 0.987449118046133, "frac_reward_zero_std": 1.0, "grad_norm": 1.9983797017175675e-07, "kl": 0.0, "learning_rate": 6.556245686680469e-09, "loss": 0.0, "num_tokens": 85686682.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2911 }, { "completion_length": 1024.9167175292969, "completions/clipped_ratio": 0.0, "completions/max_length": 2137.0, "completions/max_terminated_length": 2137.0, "completions/mean_length": 1024.916748046875, "completions/mean_terminated_length": 1024.916748046875, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 0.9877883310719131, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 6.383712905452036e-09, "loss": 0.0, "num_tokens": 85710525.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2912 }, { "completion_length": 614.2500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 614.25, "completions/mean_terminated_length": 614.25, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "epoch": 0.9881275440976933, "frac_reward_zero_std": 0.5, "grad_norm": 0.07590407878160477, "kl": 0.0, "learning_rate": 6.211180124223602e-09, "loss": -0.0004, "num_tokens": 85725522.0, "reward": 1.2666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2913 }, { "completion_length": 1635.4166870117188, "completions/clipped_ratio": 0.25, "completions/max_length": 6589.0, "completions/max_terminated_length": 4726.0, "completions/mean_length": 3282.666748046875, "completions/mean_terminated_length": 2180.5556640625, "completions/min_length": 733.0, "completions/min_terminated_length": 733.0, "epoch": 0.9884667571234735, "frac_reward_zero_std": 0.5, "grad_norm": 0.1833508312702179, "kl": NaN, "learning_rate": 6.038647342995169e-09, "loss": -0.0232, "num_tokens": 85753421.0, "reward": 0.22500000894069672, "reward_std": 0.08215838670730591, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.22500000894069672, "rewards/format_reward_func/std": 0.13568010926246643, "step": 2914 }, { "completion_length": 543.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 543.4166870117188, "completions/mean_terminated_length": 543.4166870117188, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.9888059701492538, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 5.8661145617667355e-09, "loss": 0.0, "num_tokens": 85769770.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2915 }, { "completion_length": 591.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 783.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 591.5833740234375, "completions/mean_terminated_length": 591.5833740234375, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 0.989145183175034, "frac_reward_zero_std": 1.0, "grad_norm": 1.765144332921409e-07, "kl": 0.0, "learning_rate": 5.693581780538302e-09, "loss": 0.0, "num_tokens": 85788713.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2916 }, { "completion_length": 2418.5833740234375, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 6528.0, "completions/mean_length": 2967.666748046875, "completions/mean_terminated_length": 2638.45458984375, "completions/min_length": 692.0, "completions/min_terminated_length": 692.0, "epoch": 0.9894843962008141, "frac_reward_zero_std": 0.0, "grad_norm": 0.5191672444343567, "kl": NaN, "learning_rate": 5.5210489993098685e-09, "loss": -0.0148, "num_tokens": 85832274.0, "reward": 0.6916667222976685, "reward_std": 0.42866072058677673, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 2917 }, { "completion_length": 1057.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1918.0, "completions/max_terminated_length": 1918.0, "completions/mean_length": 1057.916748046875, "completions/mean_terminated_length": 1057.916748046875, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 0.9898236092265943, "frac_reward_zero_std": 1.0, "grad_norm": 1.2170477248218958e-07, "kl": 0.0, "learning_rate": 5.3485162180814346e-09, "loss": 0.0, "num_tokens": 85856519.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2918 }, { "completion_length": 1143.666748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 1958.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 1143.666748046875, "completions/mean_terminated_length": 1143.666748046875, "completions/min_length": 668.0, "completions/min_terminated_length": 668.0, "epoch": 0.9901628222523745, "frac_reward_zero_std": 0.5, "grad_norm": 0.09391776472330093, "kl": 0.0, "learning_rate": 5.1759834368530015e-09, "loss": 0.0001, "num_tokens": 85881757.0, "reward": 0.7833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.4833333492279053, "rewards/correctness_reward_func/std": 0.5078176856040955, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2919 }, { "completion_length": 533.5833587646484, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 533.5833740234375, "completions/mean_terminated_length": 533.5833740234375, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.9905020352781547, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 5.003450655624568e-09, "loss": 0.0, "num_tokens": 85902482.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 1.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2920 }, { "completion_length": 1132.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2305.0, "completions/max_terminated_length": 2305.0, "completions/mean_length": 1132.666748046875, "completions/mean_terminated_length": 1132.666748046875, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 0.9908412483039348, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.8309178743961344e-09, "loss": 0.0, "num_tokens": 85926664.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2921 }, { "completion_length": 604.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 892.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 604.1666870117188, "completions/mean_terminated_length": 604.1666870117188, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.991180461329715, "frac_reward_zero_std": 1.0, "grad_norm": 1.0964931362877905e-07, "kl": 0.0, "learning_rate": 4.658385093167702e-09, "loss": 0.0, "num_tokens": 85943862.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2922 }, { "completion_length": 581.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1101.0, "completions/max_terminated_length": 1101.0, "completions/mean_length": 581.9166870117188, "completions/mean_terminated_length": 581.9166870117188, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.9915196743554953, "frac_reward_zero_std": 0.5, "grad_norm": 0.38362231850624084, "kl": 0.0, "learning_rate": 4.485852311939268e-09, "loss": -0.0014, "num_tokens": 85961819.0, "reward": 0.9333333373069763, "reward_std": 0.20655912160873413, "rewards/correctness_reward_func/mean": 0.6333333849906921, "rewards/correctness_reward_func/std": 0.4735424220561981, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2923 }, { "completion_length": 1065.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2054.0, "completions/max_terminated_length": 2054.0, "completions/mean_length": 1065.0833740234375, "completions/mean_terminated_length": 1065.0833740234375, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 0.9918588873812755, "frac_reward_zero_std": 0.0, "grad_norm": 0.12412099540233612, "kl": 0.0, "learning_rate": 4.313319530710835e-09, "loss": -0.0018, "num_tokens": 85991700.0, "reward": 1.183333396911621, "reward_std": 0.10641199350357056, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2924 }, { "completion_length": 1577.6666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2637.0, "completions/max_terminated_length": 2637.0, "completions/mean_length": 1577.666748046875, "completions/mean_terminated_length": 1577.666748046875, "completions/min_length": 777.0, "completions/min_terminated_length": 777.0, "epoch": 0.9921981004070556, "frac_reward_zero_std": 1.0, "grad_norm": 1.130083759903755e-07, "kl": 0.0, "learning_rate": 4.140786749482402e-09, "loss": 0.0, "num_tokens": 86024060.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2925 }, { "completion_length": 684.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1347.0, "completions/max_terminated_length": 1347.0, "completions/mean_length": 684.9166870117188, "completions/mean_terminated_length": 684.9166870117188, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.9925373134328358, "frac_reward_zero_std": 0.5, "grad_norm": 0.07191053777933121, "kl": 0.0, "learning_rate": 3.968253968253968e-09, "loss": -0.0001, "num_tokens": 86044471.0, "reward": 1.2166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.9166666865348816, "rewards/correctness_reward_func/std": 0.10298573225736618, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2926 }, { "completion_length": 1932.916748046875, "completions/clipped_ratio": 0.0, "completions/max_length": 3597.0, "completions/max_terminated_length": 3597.0, "completions/mean_length": 1932.916748046875, "completions/mean_terminated_length": 1932.916748046875, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 0.992876526458616, "frac_reward_zero_std": 1.0, "grad_norm": 3.771326646528905e-07, "kl": 0.0, "learning_rate": 3.795721187025535e-09, "loss": 0.0, "num_tokens": 86078064.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.8999999165534973, "rewards/correctness_reward_func/std": 0.10444658994674683, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2927 }, { "completion_length": 1152.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 2608.0, "completions/max_terminated_length": 2608.0, "completions/mean_length": 1152.0833740234375, "completions/mean_terminated_length": 1152.0833740234375, "completions/min_length": 567.0, "completions/min_terminated_length": 567.0, "epoch": 0.9932157394843962, "frac_reward_zero_std": 0.5, "grad_norm": 0.09726077318191528, "kl": 0.0, "learning_rate": 3.6231884057971014e-09, "loss": 0.0016, "num_tokens": 86101393.0, "reward": 1.133333444595337, "reward_std": 0.05163976177573204, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2928 }, { "completion_length": 1691.0833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 6335.0, "completions/max_terminated_length": 6335.0, "completions/mean_length": 1691.0833740234375, "completions/mean_terminated_length": 1691.0833740234375, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "epoch": 0.9935549525101763, "frac_reward_zero_std": 0.5, "grad_norm": 0.8064724206924438, "kl": 0.0, "learning_rate": 3.450655624568668e-09, "loss": -0.0306, "num_tokens": 86136608.0, "reward": 0.5499999523162842, "reward_std": 0.27386125922203064, "rewards/correctness_reward_func/mean": 0.25, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2929 }, { "completion_length": 2303.5833740234375, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 6589.0, "completions/max_terminated_length": 6299.0, "completions/mean_length": 3401.75, "completions/mean_terminated_length": 2764.300048828125, "completions/min_length": 1235.0, "completions/min_terminated_length": 1235.0, "epoch": 0.9938941655359566, "frac_reward_zero_std": 0.0, "grad_norm": 0.36295169591903687, "kl": NaN, "learning_rate": 3.2781228433402344e-09, "loss": -0.0232, "num_tokens": 86178543.0, "reward": 0.6833333969116211, "reward_std": 0.12909945845603943, "rewards/correctness_reward_func/mean": 0.43333330750465393, "rewards/correctness_reward_func/std": 0.45792683959007263, "rewards/format_reward_func/mean": 0.2500000298023224, "rewards/format_reward_func/std": 0.11677484214305878, "step": 2930 }, { "completion_length": 425.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 425.8333435058594, "completions/mean_terminated_length": 425.8333435058594, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.9942333785617368, "frac_reward_zero_std": 0.5, "grad_norm": 0.06653768569231033, "kl": 0.0, "learning_rate": 3.105590062111801e-09, "loss": 0.0008, "num_tokens": 86194771.0, "reward": 1.2833333015441895, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.9833333492279053, "rewards/correctness_reward_func/std": 0.05773502215743065, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2931 }, { "completion_length": 780.4166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1311.0, "completions/max_terminated_length": 1311.0, "completions/mean_length": 780.4166870117188, "completions/mean_terminated_length": 780.4166870117188, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "epoch": 0.994572591587517, "frac_reward_zero_std": 1.0, "grad_norm": 1.3607434823370568e-07, "kl": 0.0, "learning_rate": 2.9330572808833678e-09, "loss": 0.0, "num_tokens": 86217570.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2932 }, { "completion_length": 1142.7500305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 2666.0, "completions/max_terminated_length": 2666.0, "completions/mean_length": 1142.75, "completions/mean_terminated_length": 1142.75, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.9949118046132972, "frac_reward_zero_std": 1.0, "grad_norm": 2.495310127414996e-07, "kl": 0.0, "learning_rate": 2.7605244996549342e-09, "loss": 0.0, "num_tokens": 86241933.0, "reward": 1.1000001430511475, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.800000011920929, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2933 }, { "completion_length": 689.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1482.0, "completions/max_terminated_length": 1482.0, "completions/mean_length": 689.5833740234375, "completions/mean_terminated_length": 689.5833740234375, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.9952510176390773, "frac_reward_zero_std": 0.5, "grad_norm": 0.6133211255073547, "kl": 0.0, "learning_rate": 2.5879917184265007e-09, "loss": 0.0056, "num_tokens": 86260954.0, "reward": 1.133333444595337, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func/mean": 0.8333333134651184, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2934 }, { "completion_length": 3166.2501220703125, "completions/clipped_ratio": 0.0, "completions/max_length": 6202.0, "completions/max_terminated_length": 6202.0, "completions/mean_length": 3166.25, "completions/mean_terminated_length": 3166.25, "completions/min_length": 1316.0, "completions/min_terminated_length": 1316.0, "epoch": 0.9955902306648575, "frac_reward_zero_std": 0.5, "grad_norm": 0.9037458300590515, "kl": 0.0, "learning_rate": 2.4154589371980672e-09, "loss": -0.0007, "num_tokens": 86312161.0, "reward": 0.46666666865348816, "reward_std": 0.2581988573074341, "rewards/correctness_reward_func/mean": 0.1666666716337204, "rewards/correctness_reward_func/std": 0.3892494738101959, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2935 }, { "completion_length": 1061.3333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 3770.0, "completions/max_terminated_length": 3770.0, "completions/mean_length": 1061.3333740234375, "completions/mean_terminated_length": 1061.3333740234375, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.9959294436906377, "frac_reward_zero_std": 0.0, "grad_norm": 0.08923051506280899, "kl": 0.0, "learning_rate": 2.242926155969634e-09, "loss": -0.0013, "num_tokens": 86334617.0, "reward": 1.2666666507720947, "reward_std": 0.08164961636066437, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2936 }, { "completion_length": 772.1666870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1195.0, "completions/max_terminated_length": 1195.0, "completions/mean_length": 772.1666870117188, "completions/mean_terminated_length": 772.1666870117188, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "epoch": 0.996268656716418, "frac_reward_zero_std": 1.0, "grad_norm": 1.814175760728176e-07, "kl": 0.0, "learning_rate": 2.070393374741201e-09, "loss": 0.0, "num_tokens": 86351569.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2937 }, { "completion_length": 554.5833740234375, "completions/clipped_ratio": 0.0, "completions/max_length": 1096.0, "completions/max_terminated_length": 1096.0, "completions/mean_length": 554.5833740234375, "completions/mean_terminated_length": 554.5833740234375, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.9966078697421981, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.8978605935127675e-09, "loss": 0.0, "num_tokens": 86369870.0, "reward": 0.7999999523162842, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.5, "rewards/correctness_reward_func/std": 0.5222329497337341, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2938 }, { "completion_length": 749.5000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1339.0, "completions/max_terminated_length": 1339.0, "completions/mean_length": 749.5, "completions/mean_terminated_length": 749.5, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 0.9969470827679783, "frac_reward_zero_std": 0.0, "grad_norm": 0.07330061495304108, "kl": 0.0, "learning_rate": 1.725327812284334e-09, "loss": -0.0012, "num_tokens": 86390240.0, "reward": 1.2666666507720947, "reward_std": 0.08164961636066437, "rewards/correctness_reward_func/mean": 0.9666666984558105, "rewards/correctness_reward_func/std": 0.07784988731145859, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2939 }, { "completion_length": 1260.9167175292969, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 3013.0, "completions/mean_length": 1810.0, "completions/mean_terminated_length": 1375.5455322265625, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 0.9972862957937585, "frac_reward_zero_std": 0.5, "grad_norm": 0.4693317115306854, "kl": NaN, "learning_rate": 1.5527950310559004e-09, "loss": -0.0269, "num_tokens": 86417737.0, "reward": 1.008333444595337, "reward_std": 0.22453658282756805, "rewards/correctness_reward_func/mean": 0.7333333492279053, "rewards/correctness_reward_func/std": 0.23094011843204498, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 2940 }, { "completion_length": 421.8333435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 421.8333435058594, "completions/mean_terminated_length": 421.8333435058594, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.9976255088195387, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.3802622498274671e-09, "loss": 0.0, "num_tokens": 86439977.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.0, "rewards/correctness_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2941 }, { "completion_length": 966.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 966.0, "completions/mean_terminated_length": 966.0, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.9979647218453188, "frac_reward_zero_std": 0.5, "grad_norm": 0.04337142035365105, "kl": 0.0, "learning_rate": 1.2077294685990336e-09, "loss": -0.0009, "num_tokens": 86467973.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2942 }, { "completion_length": 1048.5833435058594, "completions/clipped_ratio": 0.0, "completions/max_length": 2676.0, "completions/max_terminated_length": 2676.0, "completions/mean_length": 1048.5833740234375, "completions/mean_terminated_length": 1048.5833740234375, "completions/min_length": 698.0, "completions/min_terminated_length": 698.0, "epoch": 0.998303934871099, "frac_reward_zero_std": 0.5, "grad_norm": 0.09704183042049408, "kl": 0.0, "learning_rate": 1.0351966873706005e-09, "loss": -0.0022, "num_tokens": 86493384.0, "reward": 0.75, "reward_std": 0.05477222427725792, "rewards/correctness_reward_func/mean": 0.45000001788139343, "rewards/correctness_reward_func/std": 0.47577688097953796, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2943 }, { "completion_length": 937.6666870117188, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 6589.0, "completions/max_terminated_length": 2951.0, "completions/mean_length": 1486.75, "completions/mean_terminated_length": 1022.9091186523438, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.9986431478968792, "frac_reward_zero_std": 0.5, "grad_norm": 0.7592448592185974, "kl": NaN, "learning_rate": 8.62663906142167e-10, "loss": -0.0323, "num_tokens": 86517644.0, "reward": 1.024999976158142, "reward_std": 0.3061861991882324, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.45226702094078064, "rewards/format_reward_func/mean": 0.2750000059604645, "rewards/format_reward_func/std": 0.08660253882408142, "step": 2944 }, { "completion_length": 1504.0000610351562, "completions/clipped_ratio": 0.0, "completions/max_length": 3593.0, "completions/max_terminated_length": 3593.0, "completions/mean_length": 1504.0, "completions/mean_terminated_length": 1504.0, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 0.9989823609226595, "frac_reward_zero_std": 0.5, "grad_norm": 0.049928199499845505, "kl": 0.0, "learning_rate": 6.901311249137336e-10, "loss": -0.0004, "num_tokens": 86545904.0, "reward": 1.183333396911621, "reward_std": 0.040824808180332184, "rewards/correctness_reward_func/mean": 0.8833333849906921, "rewards/correctness_reward_func/std": 0.10298572480678558, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2945 }, { "completion_length": 599.0000305175781, "completions/clipped_ratio": 0.0, "completions/max_length": 1241.0, "completions/max_terminated_length": 1241.0, "completions/mean_length": 599.0, "completions/mean_terminated_length": 599.0, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.9993215739484396, "frac_reward_zero_std": 0.5, "grad_norm": 0.0890926644206047, "kl": 0.0, "learning_rate": 5.175983436853002e-10, "loss": 0.0004, "num_tokens": 86565710.0, "reward": 0.7166666984558105, "reward_std": 0.040824826806783676, "rewards/correctness_reward_func/mean": 0.4166666567325592, "rewards/correctness_reward_func/std": 0.4386619031429291, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2946 }, { "completion_length": 906.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 1258.0, "completions/max_terminated_length": 1258.0, "completions/mean_length": 906.9166870117188, "completions/mean_terminated_length": 906.9166870117188, "completions/min_length": 536.0, "completions/min_terminated_length": 536.0, "epoch": 0.9996607869742198, "frac_reward_zero_std": 1.0, "grad_norm": 1.1294814328266511e-07, "kl": 0.0, "learning_rate": 3.450655624568668e-10, "loss": 0.0, "num_tokens": 86585839.0, "reward": 0.7000000476837158, "reward_std": 0.0, "rewards/correctness_reward_func/mean": 0.4000000059604645, "rewards/correctness_reward_func/std": 0.4177863895893097, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2947 }, { "completion_length": 1058.9166870117188, "completions/clipped_ratio": 0.0, "completions/max_length": 2772.0, "completions/max_terminated_length": 2772.0, "completions/mean_length": 1058.916748046875, "completions/mean_terminated_length": 1058.916748046875, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 1.0, "frac_reward_zero_std": 0.5, "grad_norm": 0.0893602967262268, "kl": 0.0, "learning_rate": 1.725327812284334e-10, "loss": 0.0013, "num_tokens": 86609982.0, "reward": 0.7666666507720947, "reward_std": 0.05163975805044174, "rewards/correctness_reward_func/mean": 0.46666666865348816, "rewards/correctness_reward_func/std": 0.4923659563064575, "rewards/format_reward_func/mean": 0.30000001192092896, "rewards/format_reward_func/std": 0.0, "step": 2948 } ], "logging_steps": 1, "max_steps": 2948, "num_input_tokens_seen": 86609982, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }