diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,24034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.33921302578018997, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 1914.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3331.0, + "completions/max_terminated_length": 3331.0, + "completions/mean_length": 1914.25, + "completions/mean_terminated_length": 1914.25, + "completions/min_length": 678.0, + "completions/min_terminated_length": 678.0, + "epoch": 0.00033921302578018993, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11923123896121979, + "kl": 0.0, + "learning_rate": 0.0, + "loss": 0.0003, + "num_tokens": 36909.0, + "reward": 0.6750000715255737, + "reward_std": 0.061237238347530365, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 1 + }, + { + "completion_length": 358.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 358.5, + "completions/mean_terminated_length": 358.5, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.0006784260515603799, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16774195432662964, + "kl": 0.0, + "learning_rate": 1e-08, + "loss": -0.0001, + "num_tokens": 56733.0, + "reward": 1.062500238418579, + "reward_std": 0.14997151494026184, + "rewards/correctness_reward_func/mean": 0.8500000834465027, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.21250002086162567, + "rewards/format_reward_func/std": 0.07723929733037949, + "step": 2 + }, + { + "completion_length": 1888.3334350585938, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4456.0, + "completions/max_terminated_length": 4456.0, + "completions/mean_length": 1888.3333740234375, + "completions/mean_terminated_length": 1888.3333740234375, + "completions/min_length": 647.0, + "completions/min_terminated_length": 647.0, + "epoch": 0.0010176390773405698, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.14695346355438232, + "kl": 0.0, + "learning_rate": 2e-08, + "loss": -0.0074, + "num_tokens": 89689.0, + "reward": 1.133333444595337, + "reward_std": 0.05163976177573204, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 3 + }, + { + "completion_length": 2936.916748046875, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5567.0, + "completions/mean_length": 3486.0, + "completions/mean_terminated_length": 3203.9091796875, + "completions/min_length": 1971.0, + "completions/min_terminated_length": 1971.0, + "epoch": 0.0013568521031207597, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.444371223449707, + "kl": NaN, + "learning_rate": 3e-08, + "loss": -0.0084, + "num_tokens": 135204.0, + "reward": 1.0125000476837158, + "reward_std": 0.31494051218032837, + "rewards/correctness_reward_func/mean": 0.75, + "rewards/correctness_reward_func/std": 0.45226702094078064, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 4 + }, + { + "completion_length": 2239.5000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5995.0, + "completions/max_terminated_length": 5995.0, + "completions/mean_length": 2239.5, + "completions/mean_terminated_length": 2239.5, + "completions/min_length": 646.0, + "completions/min_terminated_length": 646.0, + "epoch": 0.0016960651289009499, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1211001873016357, + "kl": 0.0, + "learning_rate": 4e-08, + "loss": 0.0407, + "num_tokens": 174384.0, + "reward": 0.6833333373069763, + "reward_std": 0.5333091616630554, + "rewards/correctness_reward_func/mean": 0.43333330750465393, + "rewards/correctness_reward_func/std": 0.45792683959007263, + "rewards/format_reward_func/mean": 0.25, + "rewards/format_reward_func/std": 0.07385490089654922, + "step": 5 + }, + { + "completion_length": 2380.25, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6126.0, + "completions/mean_length": 3478.416748046875, + "completions/mean_terminated_length": 2856.300048828125, + "completions/min_length": 702.0, + "completions/min_terminated_length": 702.0, + "epoch": 0.0020352781546811396, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.597161054611206, + "kl": NaN, + "learning_rate": 5e-08, + "loss": -0.0538, + "num_tokens": 213567.0, + "reward": 1.0291666984558105, + "reward_std": 0.24208299815654755, + "rewards/correctness_reward_func/mean": 0.7666666507720947, + "rewards/correctness_reward_func/std": 0.3700941801071167, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.09323723614215851, + "step": 6 + }, + { + "completion_length": 1254.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3006.0, + "completions/max_terminated_length": 3006.0, + "completions/mean_length": 1254.75, + "completions/mean_terminated_length": 1254.75, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "epoch": 0.0023744911804613297, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.42129963636398315, + "kl": 0.0, + "learning_rate": 6e-08, + "loss": -0.0015, + "num_tokens": 241776.0, + "reward": 0.8166667819023132, + "reward_std": 0.20202915370464325, + "rewards/correctness_reward_func/mean": 0.5666666626930237, + "rewards/correctness_reward_func/std": 0.5033223032951355, + "rewards/format_reward_func/mean": 0.25, + "rewards/format_reward_func/std": 0.07385490089654922, + "step": 7 + }, + { + "completion_length": 634.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1214.0, + "completions/max_terminated_length": 1214.0, + "completions/mean_length": 634.25, + "completions/mean_terminated_length": 634.25, + "completions/min_length": 400.0, + "completions/min_terminated_length": 400.0, + "epoch": 0.0027137042062415195, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0805782824754715, + "kl": 0.0, + "learning_rate": 7e-08, + "loss": -0.0005, + "num_tokens": 262713.0, + "reward": 0.21250002086162567, + "reward_std": 0.06934845447540283, + "rewards/correctness_reward_func/mean": 0.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.21250002086162567, + "rewards/format_reward_func/std": 0.07723929733037949, + "step": 8 + }, + { + "completion_length": 1587.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2970.0, + "completions/max_terminated_length": 2970.0, + "completions/mean_length": 1587.166748046875, + "completions/mean_terminated_length": 1587.166748046875, + "completions/min_length": 496.0, + "completions/min_terminated_length": 496.0, + "epoch": 0.0030529172320217096, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6700242161750793, + "kl": 0.0, + "learning_rate": 8e-08, + "loss": -0.0088, + "num_tokens": 291287.0, + "reward": 0.9000000953674316, + "reward_std": 0.21908903121948242, + "rewards/correctness_reward_func/mean": 0.6000000238418579, + "rewards/correctness_reward_func/std": 0.36181363463401794, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 9 + }, + { + "completion_length": 1056.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2969.0, + "completions/max_terminated_length": 2969.0, + "completions/mean_length": 1056.416748046875, + "completions/mean_terminated_length": 1056.416748046875, + "completions/min_length": 443.0, + "completions/min_terminated_length": 443.0, + "epoch": 0.0033921302578018998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37774741649627686, + "kl": 0.0, + "learning_rate": 9e-08, + "loss": -0.0003, + "num_tokens": 314290.0, + "reward": 0.949999988079071, + "reward_std": 0.25819891691207886, + "rewards/correctness_reward_func/mean": 0.699999988079071, + "rewards/correctness_reward_func/std": 0.3357488214969635, + "rewards/format_reward_func/mean": 0.2500000298023224, + "rewards/format_reward_func/std": 0.07385490089654922, + "step": 10 + }, + { + "completion_length": 3576.666748046875, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5869.0, + "completions/mean_length": 4125.75, + "completions/mean_terminated_length": 3901.818359375, + "completions/min_length": 1444.0, + "completions/min_terminated_length": 1444.0, + "epoch": 0.0037313432835820895, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22871029376983643, + "kl": NaN, + "learning_rate": 1e-07, + "loss": -0.017, + "num_tokens": 372348.0, + "reward": 0.6791667342185974, + "reward_std": 0.10357433557510376, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.09323723614215851, + "step": 11 + }, + { + "completion_length": 1701.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4018.0, + "completions/max_terminated_length": 4018.0, + "completions/mean_length": 1701.916748046875, + "completions/mean_terminated_length": 1701.916748046875, + "completions/min_length": 489.0, + "completions/min_terminated_length": 489.0, + "epoch": 0.004070556309362279, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6973493695259094, + "kl": 0.0, + "learning_rate": 1.0999999999999999e-07, + "loss": -0.004, + "num_tokens": 409031.0, + "reward": 0.7041667699813843, + "reward_std": 0.44965147972106934, + "rewards/correctness_reward_func/mean": 0.46666666865348816, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.23750001192092896, + "rewards/format_reward_func/std": 0.07723929733037949, + "step": 12 + }, + { + "completion_length": 1128.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2031.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1128.75, + "completions/mean_terminated_length": 1128.75, + "completions/min_length": 639.0, + "completions/min_terminated_length": 639.0, + "epoch": 0.004409769335142469, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07278123497962952, + "kl": 0.0, + "learning_rate": 1.2e-07, + "loss": -0.001, + "num_tokens": 434738.0, + "reward": 0.762499988079071, + "reward_std": 0.04107918590307236, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 13 + }, + { + "completion_length": 1370.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3375.0, + "completions/max_terminated_length": 3375.0, + "completions/mean_length": 1370.166748046875, + "completions/mean_terminated_length": 1370.166748046875, + "completions/min_length": 520.0, + "completions/min_terminated_length": 520.0, + "epoch": 0.0047489823609226595, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4310975968837738, + "kl": 0.0, + "learning_rate": 1.3e-07, + "loss": 0.0046, + "num_tokens": 464644.0, + "reward": 1.0541667938232422, + "reward_std": 0.24682849645614624, + "rewards/correctness_reward_func/mean": 0.7666666507720947, + "rewards/correctness_reward_func/std": 0.2534608840942383, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 14 + }, + { + "completion_length": 2577.416748046875, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5694.0, + "completions/mean_length": 3675.58349609375, + "completions/mean_terminated_length": 3092.900146484375, + "completions/min_length": 1257.0, + "completions/min_terminated_length": 1257.0, + "epoch": 0.00508819538670285, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4105437397956848, + "kl": NaN, + "learning_rate": 1.4e-07, + "loss": -0.0759, + "num_tokens": 511977.0, + "reward": 0.9291667938232422, + "reward_std": 0.26571446657180786, + "rewards/correctness_reward_func/mean": 0.6666666865348816, + "rewards/correctness_reward_func/std": 0.31139957904815674, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.09323723614215851, + "step": 15 + }, + { + "completion_length": 2431.2501220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4410.0, + "completions/max_terminated_length": 4410.0, + "completions/mean_length": 2431.25, + "completions/mean_terminated_length": 2431.25, + "completions/min_length": 1113.0, + "completions/min_terminated_length": 1113.0, + "epoch": 0.005427408412483039, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.087972640991211, + "kl": 0.0, + "learning_rate": 1.5e-07, + "loss": -0.0046, + "num_tokens": 552924.0, + "reward": 0.6625000238418579, + "reward_std": 0.26881134510040283, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4972652792930603, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 16 + }, + { + "completion_length": 1940.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3607.0, + "completions/max_terminated_length": 3607.0, + "completions/mean_length": 1940.5, + "completions/mean_terminated_length": 1940.5, + "completions/min_length": 601.0, + "completions/min_terminated_length": 601.0, + "epoch": 0.005766621438263229, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6243698000907898, + "kl": 0.0, + "learning_rate": 1.6e-07, + "loss": -0.0143, + "num_tokens": 585378.0, + "reward": 0.8791667819023132, + "reward_std": 0.45129260420799255, + "rewards/correctness_reward_func/mean": 0.6166666150093079, + "rewards/correctness_reward_func/std": 0.37618502974510193, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 17 + }, + { + "completion_length": 913.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1744.0, + "completions/max_terminated_length": 1744.0, + "completions/mean_length": 913.1666870117188, + "completions/mean_terminated_length": 913.1666870117188, + "completions/min_length": 411.0, + "completions/min_terminated_length": 411.0, + "epoch": 0.006105834464043419, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.548653244972229, + "kl": 0.0, + "learning_rate": 1.7000000000000001e-07, + "loss": 0.012, + "num_tokens": 606578.0, + "reward": 1.0875000953674316, + "reward_std": 0.4288218021392822, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.38138505816459656, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 18 + }, + { + "completion_length": 859.7500305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3215.0, + "completions/max_terminated_length": 3215.0, + "completions/mean_length": 859.75, + "completions/mean_terminated_length": 859.75, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "epoch": 0.006445047489823609, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5170750617980957, + "kl": 0.0, + "learning_rate": 1.8e-07, + "loss": -0.0076, + "num_tokens": 627719.0, + "reward": 0.8833333849906921, + "reward_std": 0.30571478605270386, + "rewards/correctness_reward_func/mean": 0.6333333253860474, + "rewards/correctness_reward_func/std": 0.3892494738101959, + "rewards/format_reward_func/mean": 0.25, + "rewards/format_reward_func/std": 0.07385490089654922, + "step": 19 + }, + { + "completion_length": 1683.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3455.0, + "completions/max_terminated_length": 3455.0, + "completions/mean_length": 1683.75, + "completions/mean_terminated_length": 1683.75, + "completions/min_length": 768.0, + "completions/min_terminated_length": 768.0, + "epoch": 0.0067842605156037995, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09887401759624481, + "kl": 0.0, + "learning_rate": 1.8999999999999998e-07, + "loss": -0.001, + "num_tokens": 657884.0, + "reward": 0.762499988079071, + "reward_std": 0.041079193353652954, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 20 + }, + { + "completion_length": 2319.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4392.0, + "completions/max_terminated_length": 4392.0, + "completions/mean_length": 2319.166748046875, + "completions/mean_terminated_length": 2319.166748046875, + "completions/min_length": 1144.0, + "completions/min_terminated_length": 1144.0, + "epoch": 0.007123473541383989, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4928167760372162, + "kl": 0.0, + "learning_rate": 2e-07, + "loss": 0.0001, + "num_tokens": 695722.0, + "reward": 1.070833444595337, + "reward_std": 0.2486901879310608, + "rewards/correctness_reward_func/mean": 0.7833333015441895, + "rewards/correctness_reward_func/std": 0.26227444410324097, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 21 + }, + { + "completion_length": 723.5833435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1397.0, + "completions/max_terminated_length": 1397.0, + "completions/mean_length": 723.5833740234375, + "completions/mean_terminated_length": 723.5833740234375, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "epoch": 0.007462686567164179, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5503113865852356, + "kl": 0.0, + "learning_rate": 2.0999999999999997e-07, + "loss": -0.0044, + "num_tokens": 716675.0, + "reward": 0.8333333730697632, + "reward_std": 0.5088584423065186, + "rewards/correctness_reward_func/mean": 0.5833333134651184, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.25, + "rewards/format_reward_func/std": 0.07385490089654922, + "step": 22 + }, + { + "completion_length": 1546.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3261.0, + "completions/max_terminated_length": 3261.0, + "completions/mean_length": 1546.5833740234375, + "completions/mean_terminated_length": 1546.5833740234375, + "completions/min_length": 726.0, + "completions/min_terminated_length": 726.0, + "epoch": 0.007801899592944369, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.530082643032074, + "kl": 0.0, + "learning_rate": 2.1999999999999998e-07, + "loss": -0.0047, + "num_tokens": 744030.0, + "reward": 0.5625, + "reward_std": 0.295512855052948, + "rewards/correctness_reward_func/mean": 0.29999998211860657, + "rewards/correctness_reward_func/std": 0.4472135901451111, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 23 + }, + { + "completion_length": 3152.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5800.0, + "completions/max_terminated_length": 5800.0, + "completions/mean_length": 3152.08349609375, + "completions/mean_terminated_length": 3152.08349609375, + "completions/min_length": 1237.0, + "completions/min_terminated_length": 1237.0, + "epoch": 0.008141112618724558, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.056385816715192e-07, + "kl": 0.0, + "learning_rate": 2.3e-07, + "loss": 0.0, + "num_tokens": 794473.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 24 + }, + { + "completion_length": 540.8333435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1220.0, + "completions/max_terminated_length": 1220.0, + "completions/mean_length": 540.8333740234375, + "completions/mean_terminated_length": 540.8333740234375, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.008480325644504749, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.24769005179405212, + "kl": 0.0, + "learning_rate": 2.4e-07, + "loss": -0.0003, + "num_tokens": 811541.0, + "reward": 0.7666667699813843, + "reward_std": 0.1632993221282959, + "rewards/correctness_reward_func/mean": 0.46666666865348816, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 25 + }, + { + "completion_length": 1279.666748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2073.0, + "completions/max_terminated_length": 2073.0, + "completions/mean_length": 1279.666748046875, + "completions/mean_terminated_length": 1279.666748046875, + "completions/min_length": 647.0, + "completions/min_terminated_length": 647.0, + "epoch": 0.008819538670284939, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1748448610305786, + "kl": 0.0, + "learning_rate": 2.5e-07, + "loss": 0.0003, + "num_tokens": 836509.0, + "reward": 1.1666667461395264, + "reward_std": 0.09559705853462219, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 26 + }, + { + "completion_length": 1812.5001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2724.0, + "completions/max_terminated_length": 2724.0, + "completions/mean_length": 1812.5, + "completions/mean_terminated_length": 1812.5, + "completions/min_length": 861.0, + "completions/min_terminated_length": 861.0, + "epoch": 0.009158751696065129, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.10540489852428436, + "kl": 0.0, + "learning_rate": 2.6e-07, + "loss": -0.0019, + "num_tokens": 868297.0, + "reward": 1.1541666984558105, + "reward_std": 0.05103101581335068, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 27 + }, + { + "completion_length": 1048.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2830.0, + "completions/max_terminated_length": 2830.0, + "completions/mean_length": 1048.0833740234375, + "completions/mean_terminated_length": 1048.0833740234375, + "completions/min_length": 426.0, + "completions/min_terminated_length": 426.0, + "epoch": 0.009497964721845319, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6303296089172363, + "kl": 0.0, + "learning_rate": 2.7e-07, + "loss": -0.0051, + "num_tokens": 891440.0, + "reward": 0.9958333373069763, + "reward_std": 0.41863998770713806, + "rewards/correctness_reward_func/mean": 0.7333332896232605, + "rewards/correctness_reward_func/std": 0.3550501763820648, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 28 + }, + { + "completion_length": 1568.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3112.0, + "completions/max_terminated_length": 3112.0, + "completions/mean_length": 1568.0833740234375, + "completions/mean_terminated_length": 1568.0833740234375, + "completions/min_length": 1012.0, + "completions/min_terminated_length": 1012.0, + "epoch": 0.00983717774762551, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7948746085166931, + "kl": 0.0, + "learning_rate": 2.8e-07, + "loss": 0.0124, + "num_tokens": 922821.0, + "reward": 0.783333420753479, + "reward_std": 0.45408618450164795, + "rewards/correctness_reward_func/mean": 0.5333333611488342, + "rewards/correctness_reward_func/std": 0.3938928246498108, + "rewards/format_reward_func/mean": 0.25, + "rewards/format_reward_func/std": 0.07385490089654922, + "step": 29 + }, + { + "completion_length": 1348.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2316.0, + "completions/max_terminated_length": 2316.0, + "completions/mean_length": 1348.0, + "completions/mean_terminated_length": 1348.0, + "completions/min_length": 635.0, + "completions/min_terminated_length": 635.0, + "epoch": 0.0101763907734057, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.48874858021736145, + "kl": 0.0, + "learning_rate": 2.9e-07, + "loss": -0.0019, + "num_tokens": 948873.0, + "reward": 0.8166667819023132, + "reward_std": 0.2557638883590698, + "rewards/correctness_reward_func/mean": 0.5166666507720947, + "rewards/correctness_reward_func/std": 0.4628632962703705, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 30 + }, + { + "completion_length": 699.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1161.0, + "completions/max_terminated_length": 1161.0, + "completions/mean_length": 699.8333740234375, + "completions/mean_terminated_length": 699.8333740234375, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "epoch": 0.01051560379918589, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5726268291473389, + "kl": 0.0, + "learning_rate": 3e-07, + "loss": -0.0029, + "num_tokens": 971413.0, + "reward": 0.887499988079071, + "reward_std": 0.26016825437545776, + "rewards/correctness_reward_func/mean": 0.699999988079071, + "rewards/correctness_reward_func/std": 0.43064433336257935, + "rewards/format_reward_func/mean": 0.1875, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 31 + }, + { + "completion_length": 3761.58349609375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5482.0, + "completions/max_terminated_length": 5482.0, + "completions/mean_length": 3761.58349609375, + "completions/mean_terminated_length": 3761.58349609375, + "completions/min_length": 1546.0, + "completions/min_terminated_length": 1546.0, + "epoch": 0.010854816824966078, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1386401653289795, + "kl": 0.0, + "learning_rate": 3.1e-07, + "loss": -0.0138, + "num_tokens": 1028342.0, + "reward": 0.7250000834465027, + "reward_std": 0.4932064414024353, + "rewards/correctness_reward_func/mean": 0.44999995827674866, + "rewards/correctness_reward_func/std": 0.47577691078186035, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 32 + }, + { + "completion_length": 1955.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3581.0, + "completions/max_terminated_length": 3581.0, + "completions/mean_length": 1955.3333740234375, + "completions/mean_terminated_length": 1955.3333740234375, + "completions/min_length": 715.0, + "completions/min_terminated_length": 715.0, + "epoch": 0.011194029850746268, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5336142778396606, + "kl": 0.0, + "learning_rate": 3.2e-07, + "loss": 0.0114, + "num_tokens": 1064280.0, + "reward": 1.0375001430511475, + "reward_std": 0.20600366592407227, + "rewards/correctness_reward_func/mean": 0.75, + "rewards/correctness_reward_func/std": 0.24308621883392334, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 33 + }, + { + "completion_length": 2596.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5696.0, + "completions/max_terminated_length": 5696.0, + "completions/mean_length": 2596.08349609375, + "completions/mean_terminated_length": 2596.08349609375, + "completions/min_length": 734.0, + "completions/min_terminated_length": 734.0, + "epoch": 0.011533242876526458, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.12288849800825119, + "kl": 0.0, + "learning_rate": 3.3e-07, + "loss": 0.0018, + "num_tokens": 1106305.0, + "reward": 0.6750000715255737, + "reward_std": 0.03872983902692795, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 34 + }, + { + "completion_length": 762.0833435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 931.0, + "completions/max_terminated_length": 931.0, + "completions/mean_length": 762.0833740234375, + "completions/mean_terminated_length": 762.0833740234375, + "completions/min_length": 600.0, + "completions/min_terminated_length": 600.0, + "epoch": 0.011872455902306648, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3861258625984192, + "kl": 0.0, + "learning_rate": 3.4000000000000003e-07, + "loss": -0.0014, + "num_tokens": 1127090.0, + "reward": 0.595833420753479, + "reward_std": 0.23264777660369873, + "rewards/correctness_reward_func/mean": 0.3333333432674408, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 35 + }, + { + "completion_length": 1244.5000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2654.0, + "completions/max_terminated_length": 2654.0, + "completions/mean_length": 1244.5, + "completions/mean_terminated_length": 1244.5, + "completions/min_length": 488.0, + "completions/min_terminated_length": 488.0, + "epoch": 0.012211668928086838, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5422102212905884, + "kl": 0.0, + "learning_rate": 3.5e-07, + "loss": 0.0022, + "num_tokens": 1153376.0, + "reward": 0.8583333492279053, + "reward_std": 0.21946904063224792, + "rewards/correctness_reward_func/mean": 0.5833333134651184, + "rewards/correctness_reward_func/std": 0.5149286389350891, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 36 + }, + { + "completion_length": 1497.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3081.0, + "completions/max_terminated_length": 3081.0, + "completions/mean_length": 1497.5833740234375, + "completions/mean_terminated_length": 1497.5833740234375, + "completions/min_length": 513.0, + "completions/min_terminated_length": 513.0, + "epoch": 0.012550881953867029, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6115618348121643, + "kl": 0.0, + "learning_rate": 3.6e-07, + "loss": 0.0252, + "num_tokens": 1183983.0, + "reward": 0.9916667938232422, + "reward_std": 0.2866472899913788, + "rewards/correctness_reward_func/mean": 0.7166666984558105, + "rewards/correctness_reward_func/std": 0.3459725081920624, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 37 + }, + { + "completion_length": 1688.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5040.0, + "completions/max_terminated_length": 5040.0, + "completions/mean_length": 1688.75, + "completions/mean_terminated_length": 1688.75, + "completions/min_length": 436.0, + "completions/min_terminated_length": 436.0, + "epoch": 0.012890094979647219, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9230490922927856, + "kl": 0.0, + "learning_rate": 3.7e-07, + "loss": -0.005, + "num_tokens": 1215972.0, + "reward": 0.5, + "reward_std": 0.21908903121948242, + "rewards/correctness_reward_func/mean": 0.20000000298023224, + "rewards/correctness_reward_func/std": 0.36181363463401794, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 38 + }, + { + "completion_length": 1880.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3490.0, + "completions/max_terminated_length": 3490.0, + "completions/mean_length": 1880.5833740234375, + "completions/mean_terminated_length": 1880.5833740234375, + "completions/min_length": 737.0, + "completions/min_terminated_length": 737.0, + "epoch": 0.013229308005427409, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.08842471987009048, + "kl": 0.0, + "learning_rate": 3.7999999999999996e-07, + "loss": 0.0011, + "num_tokens": 1250587.0, + "reward": 1.1875, + "reward_std": 0.03061862848699093, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 39 + }, + { + "completion_length": 1623.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3097.0, + "completions/max_terminated_length": 3097.0, + "completions/mean_length": 1623.5833740234375, + "completions/mean_terminated_length": 1623.5833740234375, + "completions/min_length": 996.0, + "completions/min_terminated_length": 996.0, + "epoch": 0.013568521031207599, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7013817429542542, + "kl": 0.0, + "learning_rate": 3.8999999999999997e-07, + "loss": 0.0118, + "num_tokens": 1284230.0, + "reward": 1.0250000953674316, + "reward_std": 0.42866072058677673, + "rewards/correctness_reward_func/mean": 0.75, + "rewards/correctness_reward_func/std": 0.36306774616241455, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 40 + }, + { + "completion_length": 1975.3333740234375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 3728.0, + "completions/mean_length": 2524.416748046875, + "completions/mean_terminated_length": 2154.9091796875, + "completions/min_length": 724.0, + "completions/min_terminated_length": 724.0, + "epoch": 0.013907734056987787, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6931108236312866, + "kl": NaN, + "learning_rate": 4e-07, + "loss": -0.0256, + "num_tokens": 1318170.0, + "reward": 0.7166666984558105, + "reward_std": 0.4313082695007324, + "rewards/correctness_reward_func/mean": 0.46666669845581055, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.25, + "rewards/format_reward_func/std": 0.07385490089654922, + "step": 41 + }, + { + "completion_length": 1261.416748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2707.0, + "completions/max_terminated_length": 2707.0, + "completions/mean_length": 1261.416748046875, + "completions/mean_terminated_length": 1261.416748046875, + "completions/min_length": 644.0, + "completions/min_terminated_length": 644.0, + "epoch": 0.014246947082767978, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.532822847366333, + "kl": 0.0, + "learning_rate": 4.0999999999999994e-07, + "loss": 0.0143, + "num_tokens": 1341893.0, + "reward": 1.0875000953674316, + "reward_std": 0.23008152842521667, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.2696799635887146, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 42 + }, + { + "completion_length": 1444.25, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4008.0, + "completions/mean_length": 1993.3333740234375, + "completions/mean_terminated_length": 1575.5455322265625, + "completions/min_length": 525.0, + "completions/min_terminated_length": 525.0, + "epoch": 0.014586160108548168, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7497459650039673, + "kl": NaN, + "learning_rate": 4.1999999999999995e-07, + "loss": -0.0279, + "num_tokens": 1370816.0, + "reward": 1.008333444595337, + "reward_std": 0.22453658282756805, + "rewards/correctness_reward_func/mean": 0.7333333492279053, + "rewards/correctness_reward_func/std": 0.23094011843204498, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 43 + }, + { + "completion_length": 1885.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4084.0, + "completions/max_terminated_length": 4084.0, + "completions/mean_length": 1885.0833740234375, + "completions/mean_terminated_length": 1885.0833740234375, + "completions/min_length": 452.0, + "completions/min_terminated_length": 452.0, + "epoch": 0.014925373134328358, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.11372081935405731, + "kl": 0.0, + "learning_rate": 4.2999999999999996e-07, + "loss": -0.0021, + "num_tokens": 1403379.0, + "reward": 0.6625000834465027, + "reward_std": 0.041079193353652954, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 44 + }, + { + "completion_length": 967.0000305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2843.0, + "completions/max_terminated_length": 2843.0, + "completions/mean_length": 967.0, + "completions/mean_terminated_length": 967.0, + "completions/min_length": 478.0, + "completions/min_terminated_length": 478.0, + "epoch": 0.015264586160108548, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4614796042442322, + "kl": 0.0, + "learning_rate": 4.3999999999999997e-07, + "loss": -0.0099, + "num_tokens": 1427145.0, + "reward": 0.5708333849906921, + "reward_std": 0.23264777660369873, + "rewards/correctness_reward_func/mean": 0.3333333432674408, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.23750001192092896, + "rewards/format_reward_func/std": 0.07723929733037949, + "step": 45 + }, + { + "completion_length": 1987.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5091.0, + "completions/max_terminated_length": 5091.0, + "completions/mean_length": 1987.5833740234375, + "completions/mean_terminated_length": 1987.5833740234375, + "completions/min_length": 601.0, + "completions/min_terminated_length": 601.0, + "epoch": 0.015603799185888738, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.45858681201934814, + "kl": 0.0, + "learning_rate": 4.5e-07, + "loss": -0.0061, + "num_tokens": 1462708.0, + "reward": 1.133333444595337, + "reward_std": 0.1632993221282959, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.2806917726993561, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 46 + }, + { + "completion_length": 2157.0001220703125, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 3806.0, + "completions/mean_length": 2706.08349609375, + "completions/mean_terminated_length": 2353.091064453125, + "completions/min_length": 1247.0, + "completions/min_terminated_length": 1247.0, + "epoch": 0.01594301221166893, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5918885469436646, + "kl": NaN, + "learning_rate": 4.6e-07, + "loss": -0.0191, + "num_tokens": 1500250.0, + "reward": 0.9625000953674316, + "reward_std": 0.4404165744781494, + "rewards/correctness_reward_func/mean": 0.699999988079071, + "rewards/correctness_reward_func/std": 0.3357488214969635, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.09323723614215851, + "step": 47 + }, + { + "completion_length": 2361.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6359.0, + "completions/max_terminated_length": 6359.0, + "completions/mean_length": 2361.166748046875, + "completions/mean_terminated_length": 2361.166748046875, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "epoch": 0.016282225237449117, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.699708938598633, + "kl": 0.0, + "learning_rate": 4.6999999999999995e-07, + "loss": -0.0034, + "num_tokens": 1538424.0, + "reward": 0.6791667342185974, + "reward_std": 0.08190402388572693, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 48 + }, + { + "completion_length": 3042.5833740234375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6276.0, + "completions/mean_length": 3591.666748046875, + "completions/mean_terminated_length": 3319.181884765625, + "completions/min_length": 962.0, + "completions/min_terminated_length": 962.0, + "epoch": 0.01662143826322931, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9630971550941467, + "kl": NaN, + "learning_rate": 4.8e-07, + "loss": -0.0544, + "num_tokens": 1590673.0, + "reward": 0.7124999761581421, + "reward_std": 0.49692243337631226, + "rewards/correctness_reward_func/mean": 0.45000001788139343, + "rewards/correctness_reward_func/std": 0.47577688097953796, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 49 + }, + { + "completion_length": 930.0000305175781, + "completions/clipped_ratio": 0.5, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 2777.0, + "completions/mean_length": 4224.5, + "completions/mean_terminated_length": 1860.0, + "completions/min_length": 740.0, + "completions/min_terminated_length": 740.0, + "epoch": 0.016960651289009497, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09431289881467819, + "kl": NaN, + "learning_rate": 4.9e-07, + "loss": -0.0007, + "num_tokens": 1614277.0, + "reward": 0.5666667222976685, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.15000000596046448, + "rewards/format_reward_func/std": 0.15666989982128143, + "step": 50 + }, + { + "completion_length": 1810.2500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3579.0, + "completions/max_terminated_length": 3579.0, + "completions/mean_length": 1810.25, + "completions/mean_terminated_length": 1810.25, + "completions/min_length": 668.0, + "completions/min_terminated_length": 668.0, + "epoch": 0.01729986431478969, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6033844947814941, + "kl": 0.0, + "learning_rate": 5e-07, + "loss": 0.0042, + "num_tokens": 1650094.0, + "reward": 0.9291666746139526, + "reward_std": 0.38409334421157837, + "rewards/correctness_reward_func/mean": 0.6666666865348816, + "rewards/correctness_reward_func/std": 0.31139957904815674, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 51 + }, + { + "completion_length": 3461.2501220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5404.0, + "completions/max_terminated_length": 5404.0, + "completions/mean_length": 3461.25, + "completions/mean_terminated_length": 3461.25, + "completions/min_length": 2344.0, + "completions/min_terminated_length": 2344.0, + "epoch": 0.017639077340569877, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9066393971443176, + "kl": 0.0, + "learning_rate": 4.998274672187715e-07, + "loss": 0.0169, + "num_tokens": 1707199.0, + "reward": 0.8625000715255737, + "reward_std": 0.43920692801475525, + "rewards/correctness_reward_func/mean": 0.6000000238418579, + "rewards/correctness_reward_func/std": 0.36181360483169556, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 52 + }, + { + "completion_length": 621.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1293.0, + "completions/max_terminated_length": 1293.0, + "completions/mean_length": 621.6666870117188, + "completions/mean_terminated_length": 621.6666870117188, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "epoch": 0.01797829036635007, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07819852232933044, + "kl": 0.0, + "learning_rate": 4.996549344375431e-07, + "loss": -0.0013, + "num_tokens": 1725603.0, + "reward": 0.7124999761581421, + "reward_std": 0.06934843957424164, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.21250002086162567, + "rewards/format_reward_func/std": 0.07723929733037949, + "step": 53 + }, + { + "completion_length": 2232.0000610351562, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5477.0, + "completions/mean_length": 3330.166748046875, + "completions/mean_terminated_length": 2678.400146484375, + "completions/min_length": 944.0, + "completions/min_terminated_length": 944.0, + "epoch": 0.018317503392130258, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5813143849372864, + "kl": NaN, + "learning_rate": 4.994824016563146e-07, + "loss": -0.0049, + "num_tokens": 1765119.0, + "reward": 0.5583333373069763, + "reward_std": 0.23038136959075928, + "rewards/correctness_reward_func/mean": 0.3333333432674408, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.2250000238418579, + "rewards/format_reward_func/std": 0.11965861171483994, + "step": 54 + }, + { + "completion_length": 529.9166717529297, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.0, + "completions/max_terminated_length": 700.0, + "completions/mean_length": 529.9166870117188, + "completions/mean_terminated_length": 529.9166870117188, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.018656716417910446, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.04094817116856575, + "kl": 0.0, + "learning_rate": 4.993098688750863e-07, + "loss": 0.0001, + "num_tokens": 1786892.0, + "reward": 1.1375000476837158, + "reward_std": 0.030618613585829735, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.23750001192092896, + "rewards/format_reward_func/std": 0.07723929733037949, + "step": 55 + }, + { + "completion_length": 2408.58349609375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4855.0, + "completions/max_terminated_length": 4855.0, + "completions/mean_length": 2408.58349609375, + "completions/mean_terminated_length": 2408.58349609375, + "completions/min_length": 1078.0, + "completions/min_terminated_length": 1078.0, + "epoch": 0.018995929443690638, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.11791203916072845, + "kl": 0.0, + "learning_rate": 4.991373360938578e-07, + "loss": 0.0014, + "num_tokens": 1823361.0, + "reward": 0.7875000834465027, + "reward_std": 0.03061862289905548, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 56 + }, + { + "completion_length": 499.4166717529297, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 499.41668701171875, + "completions/mean_terminated_length": 499.41668701171875, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.019335142469470826, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.9338906415432575e-07, + "kl": 0.0, + "learning_rate": 4.989648033126294e-07, + "loss": 0.0, + "num_tokens": 1837562.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 57 + }, + { + "completion_length": 1457.5000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3312.0, + "completions/max_terminated_length": 3312.0, + "completions/mean_length": 1457.5, + "completions/mean_terminated_length": 1457.5, + "completions/min_length": 528.0, + "completions/min_terminated_length": 528.0, + "epoch": 0.01967435549525102, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.41315922141075134, + "kl": 0.0, + "learning_rate": 4.98792270531401e-07, + "loss": 0.0077, + "num_tokens": 1867334.0, + "reward": 0.9541667699813843, + "reward_std": 0.22716552019119263, + "rewards/correctness_reward_func/mean": 0.6666666865348816, + "rewards/correctness_reward_func/std": 0.31139957904815674, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 58 + }, + { + "completion_length": 2239.666748046875, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4822.0, + "completions/mean_length": 2788.75, + "completions/mean_terminated_length": 2443.272705078125, + "completions/min_length": 1249.0, + "completions/min_terminated_length": 1249.0, + "epoch": 0.020013568521031207, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6937410831451416, + "kl": NaN, + "learning_rate": 4.986197377501725e-07, + "loss": -0.0354, + "num_tokens": 1910866.0, + "reward": 0.6125000715255737, + "reward_std": 0.26673299074172974, + "rewards/correctness_reward_func/mean": 0.3500000238418579, + "rewards/correctness_reward_func/std": 0.4358898997306824, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.09323723614215851, + "step": 59 + }, + { + "completion_length": 740.2500305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1572.0, + "completions/max_terminated_length": 1572.0, + "completions/mean_length": 740.25, + "completions/mean_terminated_length": 740.25, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "epoch": 0.0203527815468114, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09556394070386887, + "kl": 0.0, + "learning_rate": 4.984472049689441e-07, + "loss": -0.0012, + "num_tokens": 1933849.0, + "reward": 1.2208333015441895, + "reward_std": 0.07144342362880707, + "rewards/correctness_reward_func/mean": 0.9833333492279053, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.23750001192092896, + "rewards/format_reward_func/std": 0.07723929733037949, + "step": 60 + }, + { + "completion_length": 546.4166717529297, + "completions/clipped_ratio": 0.5, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 1595.0, + "completions/mean_length": 3840.916748046875, + "completions/mean_terminated_length": 1092.8333740234375, + "completions/min_length": 628.0, + "completions/min_terminated_length": 628.0, + "epoch": 0.020691994572591587, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37896645069122314, + "kl": NaN, + "learning_rate": 4.982746721877156e-07, + "loss": 0.006, + "num_tokens": 1950060.0, + "reward": 0.4833333492279053, + "reward_std": 0.22453658282756805, + "rewards/correctness_reward_func/mean": 0.3333333432674408, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.15000000596046448, + "rewards/format_reward_func/std": 0.1430193930864334, + "step": 61 + }, + { + "completion_length": 958.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1381.0, + "completions/max_terminated_length": 1381.0, + "completions/mean_length": 958.3333740234375, + "completions/mean_terminated_length": 958.3333740234375, + "completions/min_length": 633.0, + "completions/min_terminated_length": 633.0, + "epoch": 0.02103120759837178, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.059960030019283295, + "kl": 0.0, + "learning_rate": 4.981021394064872e-07, + "loss": 0.0007, + "num_tokens": 1975876.0, + "reward": 0.6875001788139343, + "reward_std": 0.03061862289905548, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 62 + }, + { + "completion_length": 1011.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2134.0, + "completions/max_terminated_length": 2134.0, + "completions/mean_length": 1011.5, + "completions/mean_terminated_length": 1011.5, + "completions/min_length": 520.0, + "completions/min_terminated_length": 520.0, + "epoch": 0.021370420624151967, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.14549453150903e-07, + "kl": 0.0, + "learning_rate": 4.979296066252588e-07, + "loss": 0.0, + "num_tokens": 2003572.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 63 + }, + { + "completion_length": 2115.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4262.0, + "completions/max_terminated_length": 4262.0, + "completions/mean_length": 2115.58349609375, + "completions/mean_terminated_length": 2115.58349609375, + "completions/min_length": 690.0, + "completions/min_terminated_length": 690.0, + "epoch": 0.021709633649932156, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.780624583920144e-07, + "kl": 0.0, + "learning_rate": 4.977570738440303e-07, + "loss": 0.0, + "num_tokens": 2040947.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 64 + }, + { + "completion_length": 5106.33349609375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6488.0, + "completions/max_terminated_length": 6488.0, + "completions/mean_length": 5106.33349609375, + "completions/mean_terminated_length": 5106.33349609375, + "completions/min_length": 2990.0, + "completions/min_terminated_length": 2990.0, + "epoch": 0.022048846675712348, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3122785091400146, + "kl": 0.0, + "learning_rate": 4.975845410628019e-07, + "loss": -0.008, + "num_tokens": 2112147.0, + "reward": 0.7666666507720947, + "reward_std": 0.521875262260437, + "rewards/correctness_reward_func/mean": 0.5166666507720947, + "rewards/correctness_reward_func/std": 0.4628632962703705, + "rewards/format_reward_func/mean": 0.25, + "rewards/format_reward_func/std": 0.07385490089654922, + "step": 65 + }, + { + "completion_length": 2463.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4120.0, + "completions/max_terminated_length": 4120.0, + "completions/mean_length": 2463.916748046875, + "completions/mean_terminated_length": 2463.916748046875, + "completions/min_length": 899.0, + "completions/min_terminated_length": 899.0, + "epoch": 0.022388059701492536, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8168641924858093, + "kl": 0.0, + "learning_rate": 4.974120082815735e-07, + "loss": -0.0017, + "num_tokens": 2156594.0, + "reward": 0.6208333969116211, + "reward_std": 0.44094666838645935, + "rewards/correctness_reward_func/mean": 0.3333333432674408, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 66 + }, + { + "completion_length": 2151.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4150.0, + "completions/max_terminated_length": 4150.0, + "completions/mean_length": 2151.166748046875, + "completions/mean_terminated_length": 2151.166748046875, + "completions/min_length": 526.0, + "completions/min_terminated_length": 526.0, + "epoch": 0.022727272727272728, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6010086536407471, + "kl": 0.0, + "learning_rate": 4.97239475500345e-07, + "loss": -0.0047, + "num_tokens": 2194228.0, + "reward": 1.0, + "reward_std": 0.25819891691207886, + "rewards/correctness_reward_func/mean": 0.699999988079071, + "rewards/correctness_reward_func/std": 0.3357488214969635, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 67 + }, + { + "completion_length": 690.7500305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1049.0, + "completions/max_terminated_length": 1049.0, + "completions/mean_length": 690.75, + "completions/mean_terminated_length": 690.75, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "epoch": 0.023066485753052916, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03490245342254639, + "kl": 0.0, + "learning_rate": 4.970669427191166e-07, + "loss": -0.0001, + "num_tokens": 2215375.0, + "reward": 0.6875001788139343, + "reward_std": 0.03061862289905548, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 68 + }, + { + "completion_length": 2392.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4364.0, + "completions/max_terminated_length": 4364.0, + "completions/mean_length": 2392.83349609375, + "completions/mean_terminated_length": 2392.83349609375, + "completions/min_length": 1438.0, + "completions/min_terminated_length": 1438.0, + "epoch": 0.023405698778833108, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14046727120876312, + "kl": 0.0, + "learning_rate": 4.968944099378881e-07, + "loss": 0.0027, + "num_tokens": 2255921.0, + "reward": 0.6916667819023132, + "reward_std": 0.07955466210842133, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 69 + }, + { + "completion_length": 3071.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6239.0, + "completions/max_terminated_length": 6239.0, + "completions/mean_length": 3071.83349609375, + "completions/mean_terminated_length": 3071.83349609375, + "completions/min_length": 683.0, + "completions/min_terminated_length": 683.0, + "epoch": 0.023744911804613297, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.15312537550926208, + "kl": 0.0, + "learning_rate": 4.967218771566598e-07, + "loss": -0.0006, + "num_tokens": 2304747.0, + "reward": 0.27500003576278687, + "reward_std": 0.03872983902692795, + "rewards/correctness_reward_func/mean": 0.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 70 + }, + { + "completion_length": 2039.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4137.0, + "completions/max_terminated_length": 4137.0, + "completions/mean_length": 2039.166748046875, + "completions/mean_terminated_length": 2039.166748046875, + "completions/min_length": 865.0, + "completions/min_terminated_length": 865.0, + "epoch": 0.02408412483039349, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6391112208366394, + "kl": 0.0, + "learning_rate": 4.965493443754314e-07, + "loss": 0.0104, + "num_tokens": 2340371.0, + "reward": 1.1375000476837158, + "reward_std": 0.2848537564277649, + "rewards/correctness_reward_func/mean": 0.8499999642372131, + "rewards/correctness_reward_func/std": 0.28444522619247437, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 71 + }, + { + "completion_length": 2516.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4060.0, + "completions/max_terminated_length": 4060.0, + "completions/mean_length": 2516.08349609375, + "completions/mean_terminated_length": 2516.08349609375, + "completions/min_length": 964.0, + "completions/min_terminated_length": 964.0, + "epoch": 0.024423337856173677, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.47134825587272644, + "kl": 0.0, + "learning_rate": 4.963768115942029e-07, + "loss": -0.0085, + "num_tokens": 2383092.0, + "reward": 0.8416666984558105, + "reward_std": 0.20202915370464325, + "rewards/correctness_reward_func/mean": 0.5666666626930237, + "rewards/correctness_reward_func/std": 0.5033223032951355, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 72 + }, + { + "completion_length": 2631.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5089.0, + "completions/max_terminated_length": 5089.0, + "completions/mean_length": 2631.83349609375, + "completions/mean_terminated_length": 2631.83349609375, + "completions/min_length": 1202.0, + "completions/min_terminated_length": 1202.0, + "epoch": 0.024762550881953865, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5083867311477661, + "kl": 0.0, + "learning_rate": 4.962042788129745e-07, + "loss": 0.0137, + "num_tokens": 2426014.0, + "reward": 0.9833334684371948, + "reward_std": 0.2473839521408081, + "rewards/correctness_reward_func/mean": 0.6833333969116211, + "rewards/correctness_reward_func/std": 0.32427075505256653, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 73 + }, + { + "completion_length": 1854.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2987.0, + "completions/max_terminated_length": 2987.0, + "completions/mean_length": 1854.166748046875, + "completions/mean_terminated_length": 1854.166748046875, + "completions/min_length": 779.0, + "completions/min_terminated_length": 779.0, + "epoch": 0.025101763907734057, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07933083176612854, + "kl": 0.0, + "learning_rate": 4.96031746031746e-07, + "loss": 0.001, + "num_tokens": 2461752.0, + "reward": 0.2875000238418579, + "reward_std": 0.03061862289905548, + "rewards/correctness_reward_func/mean": 0.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 74 + }, + { + "completion_length": 1426.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3497.0, + "completions/max_terminated_length": 3497.0, + "completions/mean_length": 1426.666748046875, + "completions/mean_terminated_length": 1426.666748046875, + "completions/min_length": 627.0, + "completions/min_terminated_length": 627.0, + "epoch": 0.025440976933514246, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.802689778036438e-07, + "kl": 0.0, + "learning_rate": 4.958592132505176e-07, + "loss": 0.0, + "num_tokens": 2487038.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 75 + }, + { + "completion_length": 1519.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3167.0, + "completions/max_terminated_length": 3167.0, + "completions/mean_length": 1519.916748046875, + "completions/mean_terminated_length": 1519.916748046875, + "completions/min_length": 605.0, + "completions/min_terminated_length": 605.0, + "epoch": 0.025780189959294438, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.08159344643354416, + "kl": 0.0, + "learning_rate": 4.956866804692891e-07, + "loss": 0.0011, + "num_tokens": 2513005.0, + "reward": 1.1166667938232422, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 76 + }, + { + "completion_length": 1891.0834350585938, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 3705.0, + "completions/mean_length": 2440.166748046875, + "completions/mean_terminated_length": 2063.0, + "completions/min_length": 823.0, + "completions/min_terminated_length": 823.0, + "epoch": 0.026119402985074626, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5463436245918274, + "kl": NaN, + "learning_rate": 4.955141476880607e-07, + "loss": -0.014, + "num_tokens": 2549246.0, + "reward": 0.8500000834465027, + "reward_std": 0.2752271890640259, + "rewards/correctness_reward_func/mean": 0.5999999642372131, + "rewards/correctness_reward_func/std": 0.36181363463401794, + "rewards/format_reward_func/mean": 0.25, + "rewards/format_reward_func/std": 0.09770084172487259, + "step": 77 + }, + { + "completion_length": 2267.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4360.0, + "completions/max_terminated_length": 4360.0, + "completions/mean_length": 2267.58349609375, + "completions/mean_terminated_length": 2267.58349609375, + "completions/min_length": 943.0, + "completions/min_terminated_length": 943.0, + "epoch": 0.026458616010854818, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3025602400302887, + "kl": 0.0, + "learning_rate": 4.953416149068323e-07, + "loss": 0.0118, + "num_tokens": 2588073.0, + "reward": 1.1583333015441895, + "reward_std": 0.10206204652786255, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 78 + }, + { + "completion_length": 1166.0000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4041.0, + "completions/max_terminated_length": 4041.0, + "completions/mean_length": 1166.0, + "completions/mean_terminated_length": 1166.0, + "completions/min_length": 448.0, + "completions/min_terminated_length": 448.0, + "epoch": 0.026797829036635006, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4117281138896942, + "kl": 0.0, + "learning_rate": 4.951690821256038e-07, + "loss": 0.0011, + "num_tokens": 2612667.0, + "reward": 1.0208334922790527, + "reward_std": 0.19391795992851257, + "rewards/correctness_reward_func/mean": 0.7333333492279053, + "rewards/correctness_reward_func/std": 0.23094011843204498, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 79 + }, + { + "completion_length": 2583.83349609375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4004.0, + "completions/max_terminated_length": 4004.0, + "completions/mean_length": 2583.83349609375, + "completions/mean_terminated_length": 2583.83349609375, + "completions/min_length": 1107.0, + "completions/min_terminated_length": 1107.0, + "epoch": 0.027137042062415198, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5607545971870422, + "kl": 0.0, + "learning_rate": 4.949965493443754e-07, + "loss": 0.0271, + "num_tokens": 2658175.0, + "reward": 1.0, + "reward_std": 0.2581988573074341, + "rewards/correctness_reward_func/mean": 0.7000000476837158, + "rewards/correctness_reward_func/std": 0.3357488214969635, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 80 + }, + { + "completion_length": 1255.3333435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3033.0, + "completions/max_terminated_length": 3033.0, + "completions/mean_length": 1255.3333740234375, + "completions/mean_terminated_length": 1255.3333740234375, + "completions/min_length": 460.0, + "completions/min_terminated_length": 460.0, + "epoch": 0.027476255088195387, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.32516950368881226, + "kl": 0.0, + "learning_rate": 4.94824016563147e-07, + "loss": -0.0018, + "num_tokens": 2681411.0, + "reward": 1.133333444595337, + "reward_std": 0.051639772951602936, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 81 + }, + { + "completion_length": 2111.2500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5036.0, + "completions/max_terminated_length": 5036.0, + "completions/mean_length": 2111.25, + "completions/mean_terminated_length": 2111.25, + "completions/min_length": 456.0, + "completions/min_terminated_length": 456.0, + "epoch": 0.027815468113975575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35042816400527954, + "kl": 0.0, + "learning_rate": 4.946514837819185e-07, + "loss": 0.0032, + "num_tokens": 2717276.0, + "reward": 1.0916666984558105, + "reward_std": 0.25535523891448975, + "rewards/correctness_reward_func/mean": 0.8166666030883789, + "rewards/correctness_reward_func/std": 0.27579087018966675, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 82 + }, + { + "completion_length": 2550.166748046875, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5000.0, + "completions/mean_length": 3099.25, + "completions/mean_terminated_length": 2782.0, + "completions/min_length": 1123.0, + "completions/min_terminated_length": 1123.0, + "epoch": 0.028154681139755767, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0620650053024292, + "kl": NaN, + "learning_rate": 4.944789510006901e-07, + "loss": -0.0267, + "num_tokens": 2759404.0, + "reward": 0.7916666865348816, + "reward_std": 0.4943132996559143, + "rewards/correctness_reward_func/mean": 0.5166666507720947, + "rewards/correctness_reward_func/std": 0.4628632962703705, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 83 + }, + { + "completion_length": 2253.5001220703125, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4483.0, + "completions/mean_length": 2802.58349609375, + "completions/mean_terminated_length": 2458.36376953125, + "completions/min_length": 885.0, + "completions/min_terminated_length": 885.0, + "epoch": 0.028493894165535955, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.1895497888326645, + "kl": NaN, + "learning_rate": 4.943064182194616e-07, + "loss": -0.0107, + "num_tokens": 2801404.0, + "reward": 0.6750000715255737, + "reward_std": 0.06123724579811096, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660254627466202, + "step": 84 + }, + { + "completion_length": 2357.0833740234375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4887.0, + "completions/mean_length": 2906.166748046875, + "completions/mean_terminated_length": 2571.36376953125, + "completions/min_length": 1036.0, + "completions/min_terminated_length": 1036.0, + "epoch": 0.028833107191316147, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8943297863006592, + "kl": NaN, + "learning_rate": 4.941338854382333e-07, + "loss": -0.0042, + "num_tokens": 2841851.0, + "reward": 0.9291667342185974, + "reward_std": 0.4828321933746338, + "rewards/correctness_reward_func/mean": 0.6666666865348816, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.09323723614215851, + "step": 85 + }, + { + "completion_length": 2335.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4581.0, + "completions/max_terminated_length": 4581.0, + "completions/mean_length": 2335.75, + "completions/mean_terminated_length": 2335.75, + "completions/min_length": 1136.0, + "completions/min_terminated_length": 1136.0, + "epoch": 0.029172320217096336, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6140693426132202, + "kl": 0.0, + "learning_rate": 4.939613526570047e-07, + "loss": -0.0082, + "num_tokens": 2883254.0, + "reward": 1.1041667461395264, + "reward_std": 0.2685222029685974, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.3857303261756897, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 86 + }, + { + "completion_length": 1397.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3341.0, + "completions/max_terminated_length": 3341.0, + "completions/mean_length": 1397.666748046875, + "completions/mean_terminated_length": 1397.666748046875, + "completions/min_length": 530.0, + "completions/min_terminated_length": 530.0, + "epoch": 0.029511533242876527, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.753872275352478, + "kl": 0.0, + "learning_rate": 4.937888198757764e-07, + "loss": -0.032, + "num_tokens": 2910766.0, + "reward": 0.9625000357627869, + "reward_std": 0.3697127103805542, + "rewards/correctness_reward_func/mean": 0.699999988079071, + "rewards/correctness_reward_func/std": 0.43064433336257935, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 87 + }, + { + "completion_length": 1930.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3123.0, + "completions/max_terminated_length": 3123.0, + "completions/mean_length": 1930.666748046875, + "completions/mean_terminated_length": 1930.666748046875, + "completions/min_length": 634.0, + "completions/min_terminated_length": 634.0, + "epoch": 0.029850746268656716, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6493222713470459, + "kl": 0.0, + "learning_rate": 4.93616287094548e-07, + "loss": -0.0047, + "num_tokens": 2943504.0, + "reward": 0.8291666507720947, + "reward_std": 0.18534879386425018, + "rewards/correctness_reward_func/mean": 0.5666666626930237, + "rewards/correctness_reward_func/std": 0.5033223032951355, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 88 + }, + { + "completion_length": 1324.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3149.0, + "completions/max_terminated_length": 3149.0, + "completions/mean_length": 1324.75, + "completions/mean_terminated_length": 1324.75, + "completions/min_length": 534.0, + "completions/min_terminated_length": 534.0, + "epoch": 0.030189959294436908, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.42489466071128845, + "kl": 0.0, + "learning_rate": 4.934437543133195e-07, + "loss": -0.0001, + "num_tokens": 2974185.0, + "reward": 0.6208333969116211, + "reward_std": 0.19391795992851257, + "rewards/correctness_reward_func/mean": 0.3333333432674408, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 89 + }, + { + "completion_length": 1459.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2928.0, + "completions/max_terminated_length": 2928.0, + "completions/mean_length": 1459.0833740234375, + "completions/mean_terminated_length": 1459.0833740234375, + "completions/min_length": 744.0, + "completions/min_terminated_length": 744.0, + "epoch": 0.030529172320217096, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0769336149096489, + "kl": 0.0, + "learning_rate": 4.932712215320911e-07, + "loss": -0.0001, + "num_tokens": 3005680.0, + "reward": 1.1166667938232422, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 90 + }, + { + "completion_length": 763.0833435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1107.0, + "completions/max_terminated_length": 1107.0, + "completions/mean_length": 763.0833740234375, + "completions/mean_terminated_length": 763.0833740234375, + "completions/min_length": 507.0, + "completions/min_terminated_length": 507.0, + "epoch": 0.030868385345997285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07657662034034729, + "kl": 0.0, + "learning_rate": 4.930986887508626e-07, + "loss": -0.0011, + "num_tokens": 3029669.0, + "reward": 0.7041667699813843, + "reward_std": 0.07144345343112946, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 91 + }, + { + "completion_length": 777.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1282.0, + "completions/max_terminated_length": 1282.0, + "completions/mean_length": 777.0, + "completions/mean_terminated_length": 777.0, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "epoch": 0.031207598371777476, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.553340791195296e-08, + "kl": 0.0, + "learning_rate": 4.929261559696342e-07, + "loss": 0.0, + "num_tokens": 3051119.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 92 + }, + { + "completion_length": 2085.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5398.0, + "completions/max_terminated_length": 5398.0, + "completions/mean_length": 2085.75, + "completions/mean_terminated_length": 2085.75, + "completions/min_length": 457.0, + "completions/min_terminated_length": 457.0, + "epoch": 0.03154681139755767, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.419355571269989, + "kl": 0.0, + "learning_rate": 4.927536231884058e-07, + "loss": 0.0014, + "num_tokens": 3089666.0, + "reward": 0.8958333730697632, + "reward_std": 0.298921674489975, + "rewards/correctness_reward_func/mean": 0.6333333253860474, + "rewards/correctness_reward_func/std": 0.3892494738101959, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 93 + }, + { + "completion_length": 2719.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4738.0, + "completions/max_terminated_length": 4738.0, + "completions/mean_length": 2719.33349609375, + "completions/mean_terminated_length": 2719.33349609375, + "completions/min_length": 763.0, + "completions/min_terminated_length": 763.0, + "epoch": 0.03188602442333786, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.419344425201416, + "kl": 0.0, + "learning_rate": 4.925810904071773e-07, + "loss": 0.0041, + "num_tokens": 3131586.0, + "reward": 0.6083334684371948, + "reward_std": 0.1906316578388214, + "rewards/correctness_reward_func/mean": 0.3333333432674408, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 94 + }, + { + "completion_length": 2132.2500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5472.0, + "completions/max_terminated_length": 5472.0, + "completions/mean_length": 2132.25, + "completions/mean_terminated_length": 2132.25, + "completions/min_length": 753.0, + "completions/min_terminated_length": 753.0, + "epoch": 0.032225237449118045, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3389549255371094, + "kl": 0.0, + "learning_rate": 4.924085576259489e-07, + "loss": 0.0064, + "num_tokens": 3170469.0, + "reward": 1.0875000953674316, + "reward_std": 0.24555771052837372, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.2696799635887146, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 95 + }, + { + "completion_length": 1292.5000305175781, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 3358.0, + "completions/mean_length": 1841.5833740234375, + "completions/mean_terminated_length": 1410.0, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "epoch": 0.032564450474898234, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10573876649141312, + "kl": NaN, + "learning_rate": 4.922360248447205e-07, + "loss": -0.0043, + "num_tokens": 3200067.0, + "reward": 0.6375000476837158, + "reward_std": 0.06934845447540283, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.23750001192092896, + "rewards/format_reward_func/std": 0.07723929733037949, + "step": 96 + }, + { + "completion_length": 1452.0000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2044.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1452.0, + "completions/mean_terminated_length": 1452.0, + "completions/min_length": 820.0, + "completions/min_terminated_length": 820.0, + "epoch": 0.03290366350067843, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.44858041405677795, + "kl": 0.0, + "learning_rate": 4.92063492063492e-07, + "loss": 0.0015, + "num_tokens": 3228213.0, + "reward": 0.833333432674408, + "reward_std": 0.20655912160873413, + "rewards/correctness_reward_func/mean": 0.5333333611488342, + "rewards/correctness_reward_func/std": 0.3938928246498108, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 97 + }, + { + "completion_length": 2012.0000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6447.0, + "completions/max_terminated_length": 6447.0, + "completions/mean_length": 2012.0, + "completions/mean_terminated_length": 2012.0, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "epoch": 0.03324287652645862, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17230212688446045, + "kl": 0.0, + "learning_rate": 4.918909592822636e-07, + "loss": 0.0018, + "num_tokens": 3265227.0, + "reward": 1.120833396911621, + "reward_std": 0.10867881774902344, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.23750001192092896, + "rewards/format_reward_func/std": 0.07723929733037949, + "step": 98 + }, + { + "completion_length": 2578.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4870.0, + "completions/max_terminated_length": 4870.0, + "completions/mean_length": 2578.166748046875, + "completions/mean_terminated_length": 2578.166748046875, + "completions/min_length": 967.0, + "completions/min_terminated_length": 967.0, + "epoch": 0.033582089552238806, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5569886565208435, + "kl": 0.0, + "learning_rate": 4.917184265010351e-07, + "loss": -0.0017, + "num_tokens": 3313511.0, + "reward": 0.9541667699813843, + "reward_std": 0.22716552019119263, + "rewards/correctness_reward_func/mean": 0.6666666865348816, + "rewards/correctness_reward_func/std": 0.31139957904815674, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 99 + }, + { + "completion_length": 715.3333435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1045.0, + "completions/max_terminated_length": 1045.0, + "completions/mean_length": 715.3333740234375, + "completions/mean_terminated_length": 715.3333740234375, + "completions/min_length": 496.0, + "completions/min_terminated_length": 496.0, + "epoch": 0.033921302578018994, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.06762003153562546, + "kl": 0.0, + "learning_rate": 4.915458937198068e-07, + "loss": -0.0004, + "num_tokens": 3333903.0, + "reward": 1.1625001430511475, + "reward_std": 0.04107918590307236, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 100 + }, + { + "completion_length": 2714.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6070.0, + "completions/max_terminated_length": 6070.0, + "completions/mean_length": 2714.916748046875, + "completions/mean_terminated_length": 2714.916748046875, + "completions/min_length": 710.0, + "completions/min_terminated_length": 710.0, + "epoch": 0.03426051560379918, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32408711314201355, + "kl": 0.0, + "learning_rate": 4.913733609385783e-07, + "loss": -0.0026, + "num_tokens": 3380318.0, + "reward": 0.7000000476837158, + "reward_std": 0.09350207448005676, + "rewards/correctness_reward_func/mean": 0.45000001788139343, + "rewards/correctness_reward_func/std": 0.47577688097953796, + "rewards/format_reward_func/mean": 0.25, + "rewards/format_reward_func/std": 0.07385490089654922, + "step": 101 + }, + { + "completion_length": 669.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1200.0, + "completions/max_terminated_length": 1200.0, + "completions/mean_length": 669.0, + "completions/mean_terminated_length": 669.0, + "completions/min_length": 477.0, + "completions/min_terminated_length": 477.0, + "epoch": 0.03459972862957938, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.05602804571390152, + "kl": 0.0, + "learning_rate": 4.912008281573499e-07, + "loss": -0.0003, + "num_tokens": 3398468.0, + "reward": 1.0750000476837158, + "reward_std": 0.038729824125766754, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 102 + }, + { + "completion_length": 2353.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6165.0, + "completions/max_terminated_length": 6165.0, + "completions/mean_length": 2353.166748046875, + "completions/mean_terminated_length": 2353.166748046875, + "completions/min_length": 870.0, + "completions/min_terminated_length": 870.0, + "epoch": 0.034938941655359566, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4930708408355713, + "kl": 0.0, + "learning_rate": 4.910282953761215e-07, + "loss": 0.043, + "num_tokens": 3432304.0, + "reward": 1.133333444595337, + "reward_std": 0.1632993221282959, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.2806917726993561, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 103 + }, + { + "completion_length": 2527.5001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4175.0, + "completions/max_terminated_length": 4175.0, + "completions/mean_length": 2527.5, + "completions/mean_terminated_length": 2527.5, + "completions/min_length": 1459.0, + "completions/min_terminated_length": 1459.0, + "epoch": 0.035278154681139755, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2505856454372406, + "kl": 0.0, + "learning_rate": 4.90855762594893e-07, + "loss": 0.0011, + "num_tokens": 3473278.0, + "reward": 1.1666667461395264, + "reward_std": 0.051639750599861145, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 104 + }, + { + "completion_length": 1665.3333740234375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5170.0, + "completions/mean_length": 2214.416748046875, + "completions/mean_terminated_length": 1816.727294921875, + "completions/min_length": 827.0, + "completions/min_terminated_length": 827.0, + "epoch": 0.03561736770691994, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7407315969467163, + "kl": NaN, + "learning_rate": 4.906832298136646e-07, + "loss": -0.0215, + "num_tokens": 3503390.0, + "reward": 0.783333420753479, + "reward_std": 0.4725285470485687, + "rewards/correctness_reward_func/mean": 0.5333333611488342, + "rewards/correctness_reward_func/std": 0.3938928246498108, + "rewards/format_reward_func/mean": 0.25, + "rewards/format_reward_func/std": 0.09770084172487259, + "step": 105 + }, + { + "completion_length": 3103.2501220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4550.0, + "completions/max_terminated_length": 4550.0, + "completions/mean_length": 3103.25, + "completions/mean_terminated_length": 3103.25, + "completions/min_length": 1426.0, + "completions/min_terminated_length": 1426.0, + "epoch": 0.03595658073270014, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6687279343605042, + "kl": 0.0, + "learning_rate": 4.905106970324361e-07, + "loss": 0.0069, + "num_tokens": 3552125.0, + "reward": 0.8375000953674316, + "reward_std": 0.2458404153585434, + "rewards/correctness_reward_func/mean": 0.550000011920929, + "rewards/correctness_reward_func/std": 0.4100997745990753, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 106 + }, + { + "completion_length": 1616.416748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3833.0, + "completions/max_terminated_length": 3833.0, + "completions/mean_length": 1616.416748046875, + "completions/mean_terminated_length": 1616.416748046875, + "completions/min_length": 682.0, + "completions/min_terminated_length": 682.0, + "epoch": 0.03629579375848033, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5577266812324524, + "kl": 0.0, + "learning_rate": 4.903381642512077e-07, + "loss": 0.0133, + "num_tokens": 3581374.0, + "reward": 1.0333333015441895, + "reward_std": 0.25819891691207886, + "rewards/correctness_reward_func/mean": 0.7333333492279053, + "rewards/correctness_reward_func/std": 0.35505014657974243, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 107 + }, + { + "completion_length": 1808.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3992.0, + "completions/max_terminated_length": 3992.0, + "completions/mean_length": 1808.25, + "completions/mean_terminated_length": 1808.25, + "completions/min_length": 550.0, + "completions/min_terminated_length": 550.0, + "epoch": 0.036635006784260515, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0762360617518425, + "kl": 0.0, + "learning_rate": 4.901656314699793e-07, + "loss": 0.0011, + "num_tokens": 3614665.0, + "reward": 1.254166603088379, + "reward_std": 0.05103101581335068, + "rewards/correctness_reward_func/mean": 0.9666666984558105, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 108 + }, + { + "completion_length": 1865.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3235.0, + "completions/max_terminated_length": 3235.0, + "completions/mean_length": 1865.8333740234375, + "completions/mean_terminated_length": 1865.8333740234375, + "completions/min_length": 726.0, + "completions/min_terminated_length": 726.0, + "epoch": 0.036974219810040704, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.45066243410110474, + "kl": 0.0, + "learning_rate": 4.899930986887508e-07, + "loss": 0.0067, + "num_tokens": 3649031.0, + "reward": 1.0333335399627686, + "reward_std": 0.1632993221282959, + "rewards/correctness_reward_func/mean": 0.7333333492279053, + "rewards/correctness_reward_func/std": 0.23094011843204498, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 109 + }, + { + "completion_length": 1545.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3815.0, + "completions/max_terminated_length": 3815.0, + "completions/mean_length": 1545.3333740234375, + "completions/mean_terminated_length": 1545.3333740234375, + "completions/min_length": 562.0, + "completions/min_terminated_length": 562.0, + "epoch": 0.03731343283582089, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9093514680862427, + "kl": 0.0, + "learning_rate": 4.898205659075224e-07, + "loss": 0.0008, + "num_tokens": 3681831.0, + "reward": 1.0250000953674316, + "reward_std": 0.3382870554924011, + "rewards/correctness_reward_func/mean": 0.75, + "rewards/correctness_reward_func/std": 0.36306774616241455, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 110 + }, + { + "completion_length": 2410.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5848.0, + "completions/max_terminated_length": 5848.0, + "completions/mean_length": 2410.916748046875, + "completions/mean_terminated_length": 2410.916748046875, + "completions/min_length": 463.0, + "completions/min_terminated_length": 463.0, + "epoch": 0.03765264586160109, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.689710795879364, + "kl": 0.0, + "learning_rate": 4.89648033126294e-07, + "loss": 0.0, + "num_tokens": 3725348.0, + "reward": 0.9833333492279053, + "reward_std": 0.3129711151123047, + "rewards/correctness_reward_func/mean": 0.6833333373069763, + "rewards/correctness_reward_func/std": 0.4217568039894104, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 111 + }, + { + "completion_length": 1068.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2515.0, + "completions/max_terminated_length": 2515.0, + "completions/mean_length": 1068.666748046875, + "completions/mean_terminated_length": 1068.666748046875, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "epoch": 0.037991858887381276, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5011056661605835, + "kl": 0.0, + "learning_rate": 4.894755003450655e-07, + "loss": 0.0033, + "num_tokens": 3749494.0, + "reward": 0.9541667699813843, + "reward_std": 0.22716552019119263, + "rewards/correctness_reward_func/mean": 0.6666666865348816, + "rewards/correctness_reward_func/std": 0.31139957904815674, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 112 + }, + { + "completion_length": 1666.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4317.0, + "completions/max_terminated_length": 4317.0, + "completions/mean_length": 1666.5833740234375, + "completions/mean_terminated_length": 1666.5833740234375, + "completions/min_length": 685.0, + "completions/min_terminated_length": 685.0, + "epoch": 0.038331071913161464, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4090554416179657, + "kl": 0.0, + "learning_rate": 4.893029675638371e-07, + "loss": 0.0057, + "num_tokens": 3779723.0, + "reward": 0.6333333253860474, + "reward_std": 0.1632993221282959, + "rewards/correctness_reward_func/mean": 0.3333333432674408, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 113 + }, + { + "completion_length": 2164.5000610351562, + "completions/clipped_ratio": 0.33333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5738.0, + "completions/mean_length": 4360.83349609375, + "completions/mean_terminated_length": 3246.75, + "completions/min_length": 1838.0, + "completions/min_terminated_length": 1838.0, + "epoch": 0.03867028493894165, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8448308706283569, + "kl": NaN, + "learning_rate": 4.891304347826087e-07, + "loss": -0.0637, + "num_tokens": 3821321.0, + "reward": 0.3499999940395355, + "reward_std": 0.4247293472290039, + "rewards/correctness_reward_func/mean": 0.14999999105930328, + "rewards/correctness_reward_func/std": 0.35290998220443726, + "rewards/format_reward_func/mean": 0.20000000298023224, + "rewards/format_reward_func/std": 0.14770980179309845, + "step": 114 + }, + { + "completion_length": 1456.2500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2937.0, + "completions/max_terminated_length": 2937.0, + "completions/mean_length": 1456.25, + "completions/mean_terminated_length": 1456.25, + "completions/min_length": 457.0, + "completions/min_terminated_length": 457.0, + "epoch": 0.03900949796472185, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.050372909754514694, + "kl": 0.0, + "learning_rate": 4.889579020013803e-07, + "loss": 0.0004, + "num_tokens": 3846560.0, + "reward": 0.75, + "reward_std": 0.03872983902692795, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.25, + "rewards/format_reward_func/std": 0.07385490089654922, + "step": 115 + }, + { + "completion_length": 3040.0001220703125, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4961.0, + "completions/mean_length": 3589.08349609375, + "completions/mean_terminated_length": 3316.36376953125, + "completions/min_length": 1877.0, + "completions/min_terminated_length": 1877.0, + "epoch": 0.03934871099050204, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7781913876533508, + "kl": NaN, + "learning_rate": 4.887853692201518e-07, + "loss": -0.038, + "num_tokens": 3899264.0, + "reward": 1.0250000953674316, + "reward_std": 0.26536139845848083, + "rewards/correctness_reward_func/mean": 0.75, + "rewards/correctness_reward_func/std": 0.24308621883392334, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 116 + }, + { + "completion_length": 3644.2501220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6581.0, + "completions/max_terminated_length": 6581.0, + "completions/mean_length": 3644.25, + "completions/mean_terminated_length": 3644.25, + "completions/min_length": 1230.0, + "completions/min_terminated_length": 1230.0, + "epoch": 0.039687924016282225, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.624457597732544, + "kl": 0.0, + "learning_rate": 4.886128364389234e-07, + "loss": 0.0081, + "num_tokens": 3961217.0, + "reward": 1.1375000476837158, + "reward_std": 0.20600365102291107, + "rewards/correctness_reward_func/mean": 0.8499999642372131, + "rewards/correctness_reward_func/std": 0.28444525599479675, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 117 + }, + { + "completion_length": 2229.5001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3979.0, + "completions/max_terminated_length": 3979.0, + "completions/mean_length": 2229.5, + "completions/mean_terminated_length": 2229.5, + "completions/min_length": 1003.0, + "completions/min_terminated_length": 1003.0, + "epoch": 0.04002713704206241, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.88440303657695e-07, + "loss": 0.0, + "num_tokens": 4004513.0, + "reward": 0.30000001192092896, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 118 + }, + { + "completion_length": 2228.416748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3815.0, + "completions/max_terminated_length": 3815.0, + "completions/mean_length": 2228.416748046875, + "completions/mean_terminated_length": 2228.416748046875, + "completions/min_length": 864.0, + "completions/min_terminated_length": 864.0, + "epoch": 0.0403663500678426, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7541438937187195, + "kl": 0.0, + "learning_rate": 4.882677708764665e-07, + "loss": -0.0092, + "num_tokens": 4043008.0, + "reward": 0.7000000476837158, + "reward_std": 0.41311824321746826, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 119 + }, + { + "completion_length": 2333.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5397.0, + "completions/max_terminated_length": 5397.0, + "completions/mean_length": 2333.83349609375, + "completions/mean_terminated_length": 2333.83349609375, + "completions/min_length": 489.0, + "completions/min_terminated_length": 489.0, + "epoch": 0.0407055630936228, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7183325290679932, + "kl": 0.0, + "learning_rate": 4.880952380952381e-07, + "loss": 0.0125, + "num_tokens": 4085732.0, + "reward": 0.970833420753479, + "reward_std": 0.3699861466884613, + "rewards/correctness_reward_func/mean": 0.6833333969116211, + "rewards/correctness_reward_func/std": 0.32427075505256653, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 120 + }, + { + "completion_length": 2344.83349609375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5423.0, + "completions/max_terminated_length": 5423.0, + "completions/mean_length": 2344.83349609375, + "completions/mean_terminated_length": 2344.83349609375, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "epoch": 0.041044776119402986, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18063339591026306, + "kl": 0.0, + "learning_rate": 4.879227053140096e-07, + "loss": 0.0024, + "num_tokens": 4126794.0, + "reward": 1.2291667461395264, + "reward_std": 0.0927189290523529, + "rewards/correctness_reward_func/mean": 0.9666666984558105, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 121 + }, + { + "completion_length": 2123.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3868.0, + "completions/max_terminated_length": 3868.0, + "completions/mean_length": 2123.166748046875, + "completions/mean_terminated_length": 2123.166748046875, + "completions/min_length": 1150.0, + "completions/min_terminated_length": 1150.0, + "epoch": 0.041383989145183174, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7253784537315369, + "kl": 0.0, + "learning_rate": 4.877501725327812e-07, + "loss": 0.0166, + "num_tokens": 4164896.0, + "reward": 1.120833396911621, + "reward_std": 0.19391795992851257, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.2806917726993561, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 122 + }, + { + "completion_length": 1970.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4138.0, + "completions/max_terminated_length": 4138.0, + "completions/mean_length": 1970.416748046875, + "completions/mean_terminated_length": 1970.416748046875, + "completions/min_length": 1346.0, + "completions/min_terminated_length": 1346.0, + "epoch": 0.04172320217096336, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.3625893294811249, + "kl": 0.0, + "learning_rate": 4.875776397515527e-07, + "loss": 0.005, + "num_tokens": 4200325.0, + "reward": 1.0208334922790527, + "reward_std": 0.19391795992851257, + "rewards/correctness_reward_func/mean": 0.7333333492279053, + "rewards/correctness_reward_func/std": 0.23094011843204498, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 123 + }, + { + "completion_length": 2338.666748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6154.0, + "completions/max_terminated_length": 6154.0, + "completions/mean_length": 2338.666748046875, + "completions/mean_terminated_length": 2338.666748046875, + "completions/min_length": 917.0, + "completions/min_terminated_length": 917.0, + "epoch": 0.04206241519674356, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6652452349662781, + "kl": 0.0, + "learning_rate": 4.874051069703243e-07, + "loss": 0.0208, + "num_tokens": 4235175.0, + "reward": 0.46250003576278687, + "reward_std": 0.26016825437545776, + "rewards/correctness_reward_func/mean": 0.20000000298023224, + "rewards/correctness_reward_func/std": 0.36181363463401794, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 124 + }, + { + "completion_length": 1641.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3666.0, + "completions/max_terminated_length": 3666.0, + "completions/mean_length": 1641.75, + "completions/mean_terminated_length": 1641.75, + "completions/min_length": 552.0, + "completions/min_terminated_length": 552.0, + "epoch": 0.042401628222523746, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5879623889923096, + "kl": 0.0, + "learning_rate": 4.872325741890959e-07, + "loss": 0.0249, + "num_tokens": 4266114.0, + "reward": 1.1041667461395264, + "reward_std": 0.23474274575710297, + "rewards/correctness_reward_func/mean": 0.8166666030883789, + "rewards/correctness_reward_func/std": 0.27579087018966675, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 125 + }, + { + "completion_length": 2013.8333740234375, + "completions/clipped_ratio": 0.33333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6537.0, + "completions/mean_length": 4210.1669921875, + "completions/mean_terminated_length": 3020.75, + "completions/min_length": 1015.0, + "completions/min_terminated_length": 1015.0, + "epoch": 0.042740841248303935, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6589382290840149, + "kl": NaN, + "learning_rate": 4.870600414078675e-07, + "loss": -0.093, + "num_tokens": 4304308.0, + "reward": 0.7333334684371948, + "reward_std": 0.2840188145637512, + "rewards/correctness_reward_func/mean": 0.5333333611488342, + "rewards/correctness_reward_func/std": 0.3938928246498108, + "rewards/format_reward_func/mean": 0.20000000298023224, + "rewards/format_reward_func/std": 0.14770980179309845, + "step": 126 + }, + { + "completion_length": 1316.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2623.0, + "completions/max_terminated_length": 2623.0, + "completions/mean_length": 1316.916748046875, + "completions/mean_terminated_length": 1316.916748046875, + "completions/min_length": 551.0, + "completions/min_terminated_length": 551.0, + "epoch": 0.04308005427408412, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.490523099899292, + "kl": 0.0, + "learning_rate": 4.86887508626639e-07, + "loss": -0.0003, + "num_tokens": 4331583.0, + "reward": 1.1708333492279053, + "reward_std": 0.2863824963569641, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.28867512941360474, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 127 + }, + { + "completion_length": 956.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1425.0, + "completions/max_terminated_length": 1425.0, + "completions/mean_length": 956.1666870117188, + "completions/mean_terminated_length": 956.1666870117188, + "completions/min_length": 601.0, + "completions/min_terminated_length": 601.0, + "epoch": 0.04341926729986431, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.34587809443473816, + "kl": 0.0, + "learning_rate": 4.867149758454106e-07, + "loss": -0.0019, + "num_tokens": 4355141.0, + "reward": 0.8625000715255737, + "reward_std": 0.26016825437545776, + "rewards/correctness_reward_func/mean": 0.6000000238418579, + "rewards/correctness_reward_func/std": 0.36181363463401794, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 128 + }, + { + "completion_length": 831.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1665.0, + "completions/max_terminated_length": 1665.0, + "completions/mean_length": 831.0833740234375, + "completions/mean_terminated_length": 831.0833740234375, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.04375848032564451, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.08698862046003342, + "kl": 0.0, + "learning_rate": 4.865424430641822e-07, + "loss": -0.0014, + "num_tokens": 4378542.0, + "reward": 0.7166666984558105, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 129 + }, + { + "completion_length": 2297.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5522.0, + "completions/max_terminated_length": 5522.0, + "completions/mean_length": 2297.75, + "completions/mean_terminated_length": 2297.75, + "completions/min_length": 1058.0, + "completions/min_terminated_length": 1058.0, + "epoch": 0.044097693351424695, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.546366810798645, + "kl": 0.0, + "learning_rate": 4.863699102829538e-07, + "loss": -0.0185, + "num_tokens": 4411467.0, + "reward": 1.1041667461395264, + "reward_std": 0.23474276065826416, + "rewards/correctness_reward_func/mean": 0.8166666030883789, + "rewards/correctness_reward_func/std": 0.27579087018966675, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 130 + }, + { + "completion_length": 2667.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4986.0, + "completions/max_terminated_length": 4986.0, + "completions/mean_length": 2667.33349609375, + "completions/mean_terminated_length": 2667.33349609375, + "completions/min_length": 1077.0, + "completions/min_terminated_length": 1077.0, + "epoch": 0.044436906377204884, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.11569435894489288, + "kl": 0.0, + "learning_rate": 4.861973775017253e-07, + "loss": -0.0001, + "num_tokens": 4458631.0, + "reward": 1.1500000953674316, + "reward_std": 0.05477222427725792, + "rewards/correctness_reward_func/mean": 0.8499999642372131, + "rewards/correctness_reward_func/std": 0.09045340120792389, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 131 + }, + { + "completion_length": 1830.0000610351562, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 3971.0, + "completions/mean_length": 2379.08349609375, + "completions/mean_terminated_length": 1996.3636474609375, + "completions/min_length": 839.0, + "completions/min_terminated_length": 839.0, + "epoch": 0.04477611940298507, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2605733573436737, + "kl": NaN, + "learning_rate": 4.860248447204969e-07, + "loss": -0.0351, + "num_tokens": 4493173.0, + "reward": 1.0250000953674316, + "reward_std": 0.26536139845848083, + "rewards/correctness_reward_func/mean": 0.75, + "rewards/correctness_reward_func/std": 0.24308621883392334, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660254627466202, + "step": 132 + }, + { + "completion_length": 2761.5001220703125, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5889.0, + "completions/mean_length": 3310.58349609375, + "completions/mean_terminated_length": 3012.545654296875, + "completions/min_length": 1545.0, + "completions/min_terminated_length": 1545.0, + "epoch": 0.04511533242876527, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6202500462532043, + "kl": NaN, + "learning_rate": 4.858523119392685e-07, + "loss": -0.0471, + "num_tokens": 4536811.0, + "reward": 0.6458333730697632, + "reward_std": 0.28442299365997314, + "rewards/correctness_reward_func/mean": 0.38333332538604736, + "rewards/correctness_reward_func/std": 0.4783177673816681, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.09323723614215851, + "step": 133 + }, + { + "completion_length": 1869.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3734.0, + "completions/max_terminated_length": 3734.0, + "completions/mean_length": 1869.666748046875, + "completions/mean_terminated_length": 1869.666748046875, + "completions/min_length": 1113.0, + "completions/min_terminated_length": 1113.0, + "epoch": 0.045454545454545456, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1787030729237813e-07, + "kl": 0.0, + "learning_rate": 4.8567977915804e-07, + "loss": 0.0, + "num_tokens": 4570359.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 134 + }, + { + "completion_length": 1182.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2124.0, + "completions/max_terminated_length": 2124.0, + "completions/mean_length": 1182.666748046875, + "completions/mean_terminated_length": 1182.666748046875, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "epoch": 0.045793758480325644, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0750177651643753, + "kl": 0.0, + "learning_rate": 4.855072463768116e-07, + "loss": 0.001, + "num_tokens": 4593809.0, + "reward": 1.133333444595337, + "reward_std": 0.051639772951602936, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 135 + }, + { + "completion_length": 1200.5000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5137.0, + "completions/max_terminated_length": 5137.0, + "completions/mean_length": 1200.5, + "completions/mean_terminated_length": 1200.5, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.04613297150610583, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5664093494415283, + "kl": 0.0, + "learning_rate": 4.853347135955831e-07, + "loss": 0.0321, + "num_tokens": 4620119.0, + "reward": 1.1666667461395264, + "reward_std": 0.18618986010551453, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.287096232175827, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 136 + }, + { + "completion_length": 2200.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3977.0, + "completions/max_terminated_length": 3977.0, + "completions/mean_length": 2200.08349609375, + "completions/mean_terminated_length": 2200.08349609375, + "completions/min_length": 997.0, + "completions/min_terminated_length": 997.0, + "epoch": 0.04647218453188602, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.42219477891921997, + "kl": 0.0, + "learning_rate": 4.851621808143547e-07, + "loss": -0.0225, + "num_tokens": 4662480.0, + "reward": 1.0375001430511475, + "reward_std": 0.20600365102291107, + "rewards/correctness_reward_func/mean": 0.75, + "rewards/correctness_reward_func/std": 0.24308621883392334, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 137 + }, + { + "completion_length": 1934.75, + "completions/clipped_ratio": 0.41666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5036.0, + "completions/mean_length": 4680.1669921875, + "completions/mean_terminated_length": 3316.71435546875, + "completions/min_length": 2530.0, + "completions/min_terminated_length": 2530.0, + "epoch": 0.046811397557666216, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.17448575794696808, + "kl": NaN, + "learning_rate": 4.849896480331262e-07, + "loss": -0.0143, + "num_tokens": 4698939.0, + "reward": 0.5875000953674316, + "reward_std": 0.06274950504302979, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.1875, + "rewards/format_reward_func/std": 0.14479610323905945, + "step": 138 + }, + { + "completion_length": 1352.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2818.0, + "completions/max_terminated_length": 2818.0, + "completions/mean_length": 1352.3333740234375, + "completions/mean_terminated_length": 1352.3333740234375, + "completions/min_length": 668.0, + "completions/min_terminated_length": 668.0, + "epoch": 0.047150610583446405, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.29719895124435425, + "kl": 0.0, + "learning_rate": 4.848171152518978e-07, + "loss": -0.0035, + "num_tokens": 4724791.0, + "reward": 1.1375000476837158, + "reward_std": 0.20600365102291107, + "rewards/correctness_reward_func/mean": 0.8499999642372131, + "rewards/correctness_reward_func/std": 0.28444522619247437, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 139 + }, + { + "completion_length": 2913.33349609375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4754.0, + "completions/mean_length": 3462.416748046875, + "completions/mean_terminated_length": 3178.181884765625, + "completions/min_length": 736.0, + "completions/min_terminated_length": 736.0, + "epoch": 0.04748982360922659, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5751773118972778, + "kl": NaN, + "learning_rate": 4.846445824706694e-07, + "loss": -0.017, + "num_tokens": 4769939.0, + "reward": 0.4208333492279053, + "reward_std": 0.21818380057811737, + "rewards/correctness_reward_func/mean": 0.13333334028720856, + "rewards/correctness_reward_func/std": 0.31139957904815674, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 140 + }, + { + "completion_length": 1474.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2125.0, + "completions/max_terminated_length": 2125.0, + "completions/mean_length": 1474.916748046875, + "completions/mean_terminated_length": 1474.916748046875, + "completions/min_length": 960.0, + "completions/min_terminated_length": 960.0, + "epoch": 0.04782903663500678, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07152793556451797, + "kl": 0.0, + "learning_rate": 4.84472049689441e-07, + "loss": 0.0009, + "num_tokens": 4803178.0, + "reward": 1.1875, + "reward_std": 0.03061862848699093, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 141 + }, + { + "completion_length": 1793.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2506.0, + "completions/max_terminated_length": 2506.0, + "completions/mean_length": 1793.916748046875, + "completions/mean_terminated_length": 1793.916748046875, + "completions/min_length": 1249.0, + "completions/min_terminated_length": 1249.0, + "epoch": 0.04816824966078698, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1186007633805275, + "kl": 0.0, + "learning_rate": 4.842995169082126e-07, + "loss": -0.0018, + "num_tokens": 4837575.0, + "reward": 1.1041667461395264, + "reward_std": 0.07144343852996826, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 142 + }, + { + "completion_length": 1622.2500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4223.0, + "completions/max_terminated_length": 4223.0, + "completions/mean_length": 1622.25, + "completions/mean_terminated_length": 1622.25, + "completions/min_length": 823.0, + "completions/min_terminated_length": 823.0, + "epoch": 0.048507462686567165, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09311472624540329, + "kl": 0.0, + "learning_rate": 4.841269841269841e-07, + "loss": -0.001, + "num_tokens": 4868670.0, + "reward": 1.183333396911621, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 143 + }, + { + "completion_length": 1624.75, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4910.0, + "completions/mean_length": 2722.916748046875, + "completions/mean_terminated_length": 1949.7000732421875, + "completions/min_length": 699.0, + "completions/min_terminated_length": 699.0, + "epoch": 0.048846675712347354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25399917364120483, + "kl": NaN, + "learning_rate": 4.839544513457557e-07, + "loss": -0.0161, + "num_tokens": 4895865.0, + "reward": 0.6666667461395264, + "reward_std": 0.11828449368476868, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.2500000298023224, + "rewards/format_reward_func/std": 0.11677484214305878, + "step": 144 + }, + { + "completion_length": 1415.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3606.0, + "completions/max_terminated_length": 3606.0, + "completions/mean_length": 1415.75, + "completions/mean_terminated_length": 1415.75, + "completions/min_length": 489.0, + "completions/min_terminated_length": 489.0, + "epoch": 0.04918588873812754, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07228909432888031, + "kl": 0.0, + "learning_rate": 4.837819185645272e-07, + "loss": -0.0003, + "num_tokens": 4924656.0, + "reward": 1.120833396911621, + "reward_std": 0.04005204886198044, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 145 + }, + { + "completion_length": 549.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 626.0, + "completions/max_terminated_length": 626.0, + "completions/mean_length": 549.4166870117188, + "completions/mean_terminated_length": 549.4166870117188, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, + "epoch": 0.04952510176390773, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.055849168449640274, + "kl": 0.0, + "learning_rate": 4.836093857832988e-07, + "loss": 0.0002, + "num_tokens": 4948643.0, + "reward": 1.1541666984558105, + "reward_std": 0.06024051457643509, + "rewards/correctness_reward_func/mean": 0.9166666865348816, + "rewards/correctness_reward_func/std": 0.10298573225736618, + "rewards/format_reward_func/mean": 0.23750001192092896, + "rewards/format_reward_func/std": 0.07723929733037949, + "step": 146 + }, + { + "completion_length": 1790.916748046875, + "completions/clipped_ratio": 0.41666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6085.0, + "completions/mean_length": 4536.33349609375, + "completions/mean_terminated_length": 3070.14306640625, + "completions/min_length": 686.0, + "completions/min_terminated_length": 686.0, + "epoch": 0.049864314789687926, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2127513885498047, + "kl": NaN, + "learning_rate": 4.834368530020704e-07, + "loss": -0.0705, + "num_tokens": 4982428.0, + "reward": 0.6416667699813843, + "reward_std": 0.22453658282756805, + "rewards/correctness_reward_func/mean": 0.46666666865348816, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.17499999701976776, + "rewards/format_reward_func/std": 0.15447859466075897, + "step": 147 + }, + { + "completion_length": 2748.416748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3929.0, + "completions/max_terminated_length": 3929.0, + "completions/mean_length": 2748.416748046875, + "completions/mean_terminated_length": 2748.416748046875, + "completions/min_length": 971.0, + "completions/min_terminated_length": 971.0, + "epoch": 0.050203527815468114, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5216585397720337, + "kl": 0.0, + "learning_rate": 4.83264320220842e-07, + "loss": -0.0151, + "num_tokens": 5029227.0, + "reward": 0.833333432674408, + "reward_std": 0.20655912160873413, + "rewards/correctness_reward_func/mean": 0.5333333611488342, + "rewards/correctness_reward_func/std": 0.3938928246498108, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 148 + }, + { + "completion_length": 1443.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2598.0, + "completions/max_terminated_length": 2598.0, + "completions/mean_length": 1443.416748046875, + "completions/mean_terminated_length": 1443.416748046875, + "completions/min_length": 442.0, + "completions/min_terminated_length": 442.0, + "epoch": 0.0505427408412483, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31681346893310547, + "kl": 0.0, + "learning_rate": 4.830917874396135e-07, + "loss": -0.0102, + "num_tokens": 5056202.0, + "reward": 0.6458333730697632, + "reward_std": 0.2625694274902344, + "rewards/correctness_reward_func/mean": 0.38333332538604736, + "rewards/correctness_reward_func/std": 0.4783177673816681, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 149 + }, + { + "completion_length": 2423.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5794.0, + "completions/max_terminated_length": 5794.0, + "completions/mean_length": 2423.166748046875, + "completions/mean_terminated_length": 2423.166748046875, + "completions/min_length": 541.0, + "completions/min_terminated_length": 541.0, + "epoch": 0.05088195386702849, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.12223686277866364, + "kl": 0.0, + "learning_rate": 4.829192546583851e-07, + "loss": -0.0021, + "num_tokens": 5094136.0, + "reward": 1.2333333492279053, + "reward_std": 0.05163976177573204, + "rewards/correctness_reward_func/mean": 0.9333333373069763, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 150 + }, + { + "completion_length": 2325.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5329.0, + "completions/max_terminated_length": 5329.0, + "completions/mean_length": 2325.666748046875, + "completions/mean_terminated_length": 2325.666748046875, + "completions/min_length": 751.0, + "completions/min_terminated_length": 751.0, + "epoch": 0.05122116689280869, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9536461234092712, + "kl": 0.0, + "learning_rate": 4.827467218771566e-07, + "loss": -0.0068, + "num_tokens": 5137452.0, + "reward": 0.7416666746139526, + "reward_std": 0.4643779993057251, + "rewards/correctness_reward_func/mean": 0.46666666865348816, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 151 + }, + { + "completion_length": 885.8333435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1787.0, + "completions/max_terminated_length": 1787.0, + "completions/mean_length": 885.8333740234375, + "completions/mean_terminated_length": 885.8333740234375, + "completions/min_length": 605.0, + "completions/min_terminated_length": 605.0, + "epoch": 0.051560379918588875, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8494682908058167, + "kl": 0.0, + "learning_rate": 4.825741890959282e-07, + "loss": 0.0056, + "num_tokens": 5155720.0, + "reward": 0.9250000715255737, + "reward_std": 0.2563120126724243, + "rewards/correctness_reward_func/mean": 0.699999988079071, + "rewards/correctness_reward_func/std": 0.3357488214969635, + "rewards/format_reward_func/mean": 0.22500000894069672, + "rewards/format_reward_func/std": 0.07833494991064072, + "step": 152 + }, + { + "completion_length": 1869.3334350585938, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4111.0, + "completions/max_terminated_length": 4111.0, + "completions/mean_length": 1869.3333740234375, + "completions/mean_terminated_length": 1869.3333740234375, + "completions/min_length": 547.0, + "completions/min_terminated_length": 547.0, + "epoch": 0.05189959294436906, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1964230239391327, + "kl": 0.0, + "learning_rate": 4.824016563146997e-07, + "loss": 0.0004, + "num_tokens": 5188076.0, + "reward": 1.2000000476837158, + "reward_std": 0.08164963126182556, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 153 + }, + { + "completion_length": 1415.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2859.0, + "completions/max_terminated_length": 2859.0, + "completions/mean_length": 1415.666748046875, + "completions/mean_terminated_length": 1415.666748046875, + "completions/min_length": 632.0, + "completions/min_terminated_length": 632.0, + "epoch": 0.05223880597014925, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35508859157562256, + "kl": 0.0, + "learning_rate": 4.822291235334713e-07, + "loss": 0.0054, + "num_tokens": 5217724.0, + "reward": 0.7416666746139526, + "reward_std": 0.20202915370464325, + "rewards/correctness_reward_func/mean": 0.46666669845581055, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 154 + }, + { + "completion_length": 2550.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5538.0, + "completions/max_terminated_length": 5538.0, + "completions/mean_length": 2550.166748046875, + "completions/mean_terminated_length": 2550.166748046875, + "completions/min_length": 971.0, + "completions/min_terminated_length": 971.0, + "epoch": 0.05257801899592944, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.22098730504512787, + "kl": 0.0, + "learning_rate": 4.82056590752243e-07, + "loss": 0.0016, + "num_tokens": 5257764.0, + "reward": 1.120833396911621, + "reward_std": 0.06785397976636887, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 155 + }, + { + "completion_length": 693.2500305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1217.0, + "completions/max_terminated_length": 1217.0, + "completions/mean_length": 693.25, + "completions/mean_terminated_length": 693.25, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.052917232021709636, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.05356336012482643, + "kl": 0.0, + "learning_rate": 4.818840579710144e-07, + "loss": 0.0001, + "num_tokens": 5279343.0, + "reward": 1.1166667938232422, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 156 + }, + { + "completion_length": 840.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1473.0, + "completions/max_terminated_length": 1473.0, + "completions/mean_length": 840.4166870117188, + "completions/mean_terminated_length": 840.4166870117188, + "completions/min_length": 476.0, + "completions/min_terminated_length": 476.0, + "epoch": 0.053256445047489824, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.38578730821609497, + "kl": 0.0, + "learning_rate": 4.817115251897861e-07, + "loss": -0.0039, + "num_tokens": 5303306.0, + "reward": 0.5416666269302368, + "reward_std": 0.24528895318508148, + "rewards/correctness_reward_func/mean": 0.2666666805744171, + "rewards/correctness_reward_func/std": 0.393892765045166, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 157 + }, + { + "completion_length": 2539.666748046875, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6584.0, + "completions/mean_length": 3637.83349609375, + "completions/mean_terminated_length": 3047.60009765625, + "completions/min_length": 1685.0, + "completions/min_terminated_length": 1685.0, + "epoch": 0.05359565807327001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7518226504325867, + "kl": NaN, + "learning_rate": 4.815389924085576e-07, + "loss": -0.0562, + "num_tokens": 5345350.0, + "reward": 0.8791667819023132, + "reward_std": 0.4417826533317566, + "rewards/correctness_reward_func/mean": 0.6166666746139526, + "rewards/correctness_reward_func/std": 0.37618499994277954, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.09323723614215851, + "step": 158 + }, + { + "completion_length": 1451.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4533.0, + "completions/max_terminated_length": 4533.0, + "completions/mean_length": 1451.666748046875, + "completions/mean_terminated_length": 1451.666748046875, + "completions/min_length": 489.0, + "completions/min_terminated_length": 489.0, + "epoch": 0.0539348710990502, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09454381465911865, + "kl": 0.0, + "learning_rate": 4.813664596273292e-07, + "loss": -0.0008, + "num_tokens": 5374308.0, + "reward": 1.1041667461395264, + "reward_std": 0.05571504682302475, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 159 + }, + { + "completion_length": 3554.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5146.0, + "completions/max_terminated_length": 5146.0, + "completions/mean_length": 3554.08349609375, + "completions/mean_terminated_length": 3554.08349609375, + "completions/min_length": 1727.0, + "completions/min_terminated_length": 1727.0, + "epoch": 0.054274084124830396, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.624274308502208e-07, + "kl": 0.0, + "learning_rate": 4.811939268461007e-07, + "loss": 0.0, + "num_tokens": 5428465.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 160 + }, + { + "completion_length": 1793.2500610351562, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5850.0, + "completions/mean_length": 2891.416748046875, + "completions/mean_terminated_length": 2151.900146484375, + "completions/min_length": 509.0, + "completions/min_terminated_length": 509.0, + "epoch": 0.054613297150610585, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6140305995941162, + "kl": NaN, + "learning_rate": 4.810213940648723e-07, + "loss": -0.102, + "num_tokens": 5464558.0, + "reward": 1.0166666507720947, + "reward_std": 0.3798363208770752, + "rewards/correctness_reward_func/mean": 0.7666666507720947, + "rewards/correctness_reward_func/std": 0.3700941801071167, + "rewards/format_reward_func/mean": 0.25, + "rewards/format_reward_func/std": 0.11677484214305878, + "step": 161 + }, + { + "completion_length": 2627.666748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6292.0, + "completions/max_terminated_length": 6292.0, + "completions/mean_length": 2627.666748046875, + "completions/mean_terminated_length": 2627.666748046875, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "epoch": 0.05495251017639077, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.808488612836439e-07, + "loss": 0.0, + "num_tokens": 5509680.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 162 + }, + { + "completion_length": 2503.8333740234375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4827.0, + "completions/mean_length": 3052.916748046875, + "completions/mean_terminated_length": 2731.45458984375, + "completions/min_length": 867.0, + "completions/min_terminated_length": 867.0, + "epoch": 0.05529172320217096, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7075985670089722, + "kl": NaN, + "learning_rate": 4.806763285024155e-07, + "loss": 0.0062, + "num_tokens": 5549920.0, + "reward": 0.8583333492279053, + "reward_std": 0.4465666711330414, + "rewards/correctness_reward_func/mean": 0.5833333134651184, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 163 + }, + { + "completion_length": 2481.7501220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4697.0, + "completions/max_terminated_length": 4697.0, + "completions/mean_length": 2481.75, + "completions/mean_terminated_length": 2481.75, + "completions/min_length": 965.0, + "completions/min_terminated_length": 965.0, + "epoch": 0.05563093622795115, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6668363213539124, + "kl": 0.0, + "learning_rate": 4.80503795721187e-07, + "loss": -0.0086, + "num_tokens": 5592253.0, + "reward": 1.066666841506958, + "reward_std": 0.21493908762931824, + "rewards/correctness_reward_func/mean": 0.7666666507720947, + "rewards/correctness_reward_func/std": 0.2534608840942383, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 164 + }, + { + "completion_length": 1200.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2080.0, + "completions/max_terminated_length": 2080.0, + "completions/mean_length": 1200.416748046875, + "completions/mean_terminated_length": 1200.416748046875, + "completions/min_length": 654.0, + "completions/min_terminated_length": 654.0, + "epoch": 0.055970149253731345, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0779814374473062e-07, + "kl": 0.0, + "learning_rate": 4.803312629399586e-07, + "loss": 0.0, + "num_tokens": 5621964.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 165 + }, + { + "completion_length": 1244.5000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3143.0, + "completions/max_terminated_length": 3143.0, + "completions/mean_length": 1244.5, + "completions/mean_terminated_length": 1244.5, + "completions/min_length": 583.0, + "completions/min_terminated_length": 583.0, + "epoch": 0.056309362279511534, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4018653631210327, + "kl": 0.0, + "learning_rate": 4.801587301587301e-07, + "loss": 0.0058, + "num_tokens": 5650680.0, + "reward": 0.833333432674408, + "reward_std": 0.20655912160873413, + "rewards/correctness_reward_func/mean": 0.5333333611488342, + "rewards/correctness_reward_func/std": 0.3938928246498108, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 166 + }, + { + "completion_length": 620.6666717529297, + "completions/clipped_ratio": 0.0, + "completions/max_length": 960.0, + "completions/max_terminated_length": 960.0, + "completions/mean_length": 620.6666870117188, + "completions/mean_terminated_length": 620.6666870117188, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "epoch": 0.05664857530529172, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.042549144476652145, + "kl": 0.0, + "learning_rate": 4.799861973775017e-07, + "loss": -0.0001, + "num_tokens": 5670530.0, + "reward": 0.7875000834465027, + "reward_std": 0.03061862289905548, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 167 + }, + { + "completion_length": 3173.7501220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6019.0, + "completions/max_terminated_length": 6019.0, + "completions/mean_length": 3173.75, + "completions/mean_terminated_length": 3173.75, + "completions/min_length": 1452.0, + "completions/min_terminated_length": 1452.0, + "epoch": 0.05698778833107191, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9396628141403198, + "kl": 0.0, + "learning_rate": 4.798136645962732e-07, + "loss": 0.0374, + "num_tokens": 5720045.0, + "reward": 0.9541667699813843, + "reward_std": 0.42716550827026367, + "rewards/correctness_reward_func/mean": 0.6666666269302368, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 168 + }, + { + "completion_length": 1277.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1644.0, + "completions/max_terminated_length": 1644.0, + "completions/mean_length": 1277.8333740234375, + "completions/mean_terminated_length": 1277.8333740234375, + "completions/min_length": 650.0, + "completions/min_terminated_length": 650.0, + "epoch": 0.057327001356852106, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.870347088901326e-07, + "kl": 0.0, + "learning_rate": 4.796411318150448e-07, + "loss": 0.0, + "num_tokens": 5749149.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 169 + }, + { + "completion_length": 1964.0000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3663.0, + "completions/max_terminated_length": 3663.0, + "completions/mean_length": 1964.0, + "completions/mean_terminated_length": 1964.0, + "completions/min_length": 859.0, + "completions/min_terminated_length": 859.0, + "epoch": 0.057666214382632294, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.475134015083313, + "kl": 0.0, + "learning_rate": 4.794685990338165e-07, + "loss": -0.0215, + "num_tokens": 5786289.0, + "reward": 0.7833334803581238, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func/mean": 0.4833333492279053, + "rewards/correctness_reward_func/std": 0.43029236793518066, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 170 + }, + { + "completion_length": 1437.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1939.0, + "completions/max_terminated_length": 1939.0, + "completions/mean_length": 1437.3333740234375, + "completions/mean_terminated_length": 1437.3333740234375, + "completions/min_length": 742.0, + "completions/min_terminated_length": 742.0, + "epoch": 0.05800542740841248, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4975443482398987, + "kl": 0.0, + "learning_rate": 4.792960662525879e-07, + "loss": 0.0067, + "num_tokens": 5814013.0, + "reward": 0.9458333849906921, + "reward_std": 0.39763349294662476, + "rewards/correctness_reward_func/mean": 0.6833333969116211, + "rewards/correctness_reward_func/std": 0.32427075505256653, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 171 + }, + { + "completion_length": 3024.5833740234375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6300.0, + "completions/mean_length": 3573.666748046875, + "completions/mean_terminated_length": 3299.545654296875, + "completions/min_length": 1574.0, + "completions/min_terminated_length": 1574.0, + "epoch": 0.05834464043419267, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2008449286222458, + "kl": NaN, + "learning_rate": 4.791235334713596e-07, + "loss": -0.011, + "num_tokens": 5861486.0, + "reward": 0.7375000715255737, + "reward_std": 0.0853908509016037, + "rewards/correctness_reward_func/mean": 0.44999995827674866, + "rewards/correctness_reward_func/std": 0.47577688097953796, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 172 + }, + { + "completion_length": 2863.7501220703125, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5883.0, + "completions/mean_length": 4511.0, + "completions/mean_terminated_length": 3818.333251953125, + "completions/min_length": 2050.0, + "completions/min_terminated_length": 2050.0, + "epoch": 0.05868385345997286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30197957158088684, + "kl": NaN, + "learning_rate": 4.789510006901311e-07, + "loss": -0.0339, + "num_tokens": 5903651.0, + "reward": 0.22500000894069672, + "reward_std": 0.13869690895080566, + "rewards/correctness_reward_func/mean": 0.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.22500000894069672, + "rewards/format_reward_func/std": 0.13568010926246643, + "step": 173 + }, + { + "completion_length": 1077.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3142.0, + "completions/max_terminated_length": 3142.0, + "completions/mean_length": 1077.8333740234375, + "completions/mean_terminated_length": 1077.8333740234375, + "completions/min_length": 555.0, + "completions/min_terminated_length": 555.0, + "epoch": 0.059023066485753055, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.787784679089027e-07, + "loss": 0.0, + "num_tokens": 5927547.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 174 + }, + { + "completion_length": 2095.3334350585938, + "completions/clipped_ratio": 0.41666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6126.0, + "completions/mean_length": 4840.75, + "completions/mean_terminated_length": 3592.000244140625, + "completions/min_length": 1967.0, + "completions/min_terminated_length": 1967.0, + "epoch": 0.05936227951153324, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6622049808502197, + "kl": NaN, + "learning_rate": 4.786059351276742e-07, + "loss": -0.071, + "num_tokens": 5961595.0, + "reward": 0.24166667461395264, + "reward_std": 0.22453658282756805, + "rewards/correctness_reward_func/mean": 0.06666667014360428, + "rewards/correctness_reward_func/std": 0.2309401035308838, + "rewards/format_reward_func/mean": 0.17500001192092896, + "rewards/format_reward_func/std": 0.15447859466075897, + "step": 175 + }, + { + "completion_length": 2191.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3487.0, + "completions/max_terminated_length": 3487.0, + "completions/mean_length": 2191.75, + "completions/mean_terminated_length": 2191.75, + "completions/min_length": 704.0, + "completions/min_terminated_length": 704.0, + "epoch": 0.05970149253731343, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6641126275062561, + "kl": 0.0, + "learning_rate": 4.784334023464458e-07, + "loss": -0.0095, + "num_tokens": 5995228.0, + "reward": 0.9000000953674316, + "reward_std": 0.21908903121948242, + "rewards/correctness_reward_func/mean": 0.6000000238418579, + "rewards/correctness_reward_func/std": 0.36181360483169556, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 176 + }, + { + "completion_length": 2058.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5513.0, + "completions/max_terminated_length": 5513.0, + "completions/mean_length": 2058.166748046875, + "completions/mean_terminated_length": 2058.166748046875, + "completions/min_length": 854.0, + "completions/min_terminated_length": 854.0, + "epoch": 0.06004070556309362, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.813504522691801e-07, + "kl": 0.0, + "learning_rate": 4.782608695652174e-07, + "loss": 0.0, + "num_tokens": 6032094.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 177 + }, + { + "completion_length": 2605.0001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5421.0, + "completions/max_terminated_length": 5421.0, + "completions/mean_length": 2605.0, + "completions/mean_terminated_length": 2605.0, + "completions/min_length": 1045.0, + "completions/min_terminated_length": 1045.0, + "epoch": 0.060379918588873815, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9859559535980225, + "kl": 0.0, + "learning_rate": 4.780883367839889e-07, + "loss": -0.0052, + "num_tokens": 6076770.0, + "reward": 1.0333333015441895, + "reward_std": 0.25819891691207886, + "rewards/correctness_reward_func/mean": 0.7333332896232605, + "rewards/correctness_reward_func/std": 0.3550501763820648, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 178 + }, + { + "completion_length": 2074.8334350585938, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4515.0, + "completions/mean_length": 2623.916748046875, + "completions/mean_terminated_length": 2263.45458984375, + "completions/min_length": 1183.0, + "completions/min_terminated_length": 1183.0, + "epoch": 0.060719131614654004, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6357032656669617, + "kl": NaN, + "learning_rate": 4.779158040027605e-07, + "loss": -0.0209, + "num_tokens": 6112522.0, + "reward": 1.0916666984558105, + "reward_std": 0.26536136865615845, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.27579087018966675, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660254627466202, + "step": 179 + }, + { + "completion_length": 1129.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2240.0, + "completions/max_terminated_length": 2240.0, + "completions/mean_length": 1129.0833740234375, + "completions/mean_terminated_length": 1129.0833740234375, + "completions/min_length": 559.0, + "completions/min_terminated_length": 559.0, + "epoch": 0.06105834464043419, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09178594499826431, + "kl": 0.0, + "learning_rate": 4.777432712215321e-07, + "loss": -0.001, + "num_tokens": 6134249.0, + "reward": 1.25, + "reward_std": 0.09246455878019333, + "rewards/correctness_reward_func/mean": 0.949999988079071, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 180 + }, + { + "completion_length": 2996.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5630.0, + "completions/max_terminated_length": 5630.0, + "completions/mean_length": 2996.5, + "completions/mean_terminated_length": 2996.5, + "completions/min_length": 794.0, + "completions/min_terminated_length": 794.0, + "epoch": 0.06139755766621438, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6492721438407898, + "kl": 0.0, + "learning_rate": 4.775707384403036e-07, + "loss": 0.0108, + "num_tokens": 6184307.0, + "reward": 0.9666668176651001, + "reward_std": 0.20655910670757294, + "rewards/correctness_reward_func/mean": 0.6666666865348816, + "rewards/correctness_reward_func/std": 0.31139957904815674, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 181 + }, + { + "completion_length": 2291.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4261.0, + "completions/max_terminated_length": 4261.0, + "completions/mean_length": 2291.25, + "completions/mean_terminated_length": 2291.25, + "completions/min_length": 632.0, + "completions/min_terminated_length": 632.0, + "epoch": 0.06173677069199457, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.625225841999054, + "kl": 0.0, + "learning_rate": 4.773982056590752e-07, + "loss": 0.0127, + "num_tokens": 6219686.0, + "reward": 0.6333333253860474, + "reward_std": 0.25819891691207886, + "rewards/correctness_reward_func/mean": 0.3333333432674408, + "rewards/correctness_reward_func/std": 0.4923659563064575, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 182 + }, + { + "completion_length": 1227.0833740234375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 2438.0, + "completions/mean_length": 1776.166748046875, + "completions/mean_terminated_length": 1338.6363525390625, + "completions/min_length": 740.0, + "completions/min_terminated_length": 740.0, + "epoch": 0.062075983717774764, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7521762847900391, + "kl": NaN, + "learning_rate": 4.772256728778468e-07, + "loss": -0.0184, + "num_tokens": 6244383.0, + "reward": 0.9291666746139526, + "reward_std": 0.41845452785491943, + "rewards/correctness_reward_func/mean": 0.6666666865348816, + "rewards/correctness_reward_func/std": 0.31139957904815674, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.09323723614215851, + "step": 183 + }, + { + "completion_length": 2018.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6042.0, + "completions/max_terminated_length": 6042.0, + "completions/mean_length": 2018.0833740234375, + "completions/mean_terminated_length": 2018.0833740234375, + "completions/min_length": 479.0, + "completions/min_terminated_length": 479.0, + "epoch": 0.06241519674355495, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6339729428291321, + "kl": 0.0, + "learning_rate": 4.770531400966183e-07, + "loss": -0.0363, + "num_tokens": 6279580.0, + "reward": 0.7583333849906921, + "reward_std": 0.21946904063224792, + "rewards/correctness_reward_func/mean": 0.4833333492279053, + "rewards/correctness_reward_func/std": 0.43029236793518066, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 184 + }, + { + "completion_length": 2131.0000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4085.0, + "completions/max_terminated_length": 4085.0, + "completions/mean_length": 2131.0, + "completions/mean_terminated_length": 2131.0, + "completions/min_length": 1032.0, + "completions/min_terminated_length": 1032.0, + "epoch": 0.06275440976933515, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.2719327691665967e-07, + "kl": 0.0, + "learning_rate": 4.7688060731539e-07, + "loss": 0.0, + "num_tokens": 6317164.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 185 + }, + { + "completion_length": 1139.8333435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2115.0, + "completions/max_terminated_length": 2115.0, + "completions/mean_length": 1139.8333740234375, + "completions/mean_terminated_length": 1139.8333740234375, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "epoch": 0.06309362279511534, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.9935814066229796e-07, + "kl": 0.0, + "learning_rate": 4.7670807453416146e-07, + "loss": 0.0, + "num_tokens": 6345572.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 186 + }, + { + "completion_length": 2896.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4331.0, + "completions/max_terminated_length": 4331.0, + "completions/mean_length": 2896.08349609375, + "completions/mean_terminated_length": 2896.08349609375, + "completions/min_length": 1513.0, + "completions/min_terminated_length": 1513.0, + "epoch": 0.06343283582089553, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.4238826174259884e-07, + "kl": 0.0, + "learning_rate": 4.76535541752933e-07, + "loss": 0.0, + "num_tokens": 6390285.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 187 + }, + { + "completion_length": 1860.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4907.0, + "completions/max_terminated_length": 4907.0, + "completions/mean_length": 1860.25, + "completions/mean_terminated_length": 1860.25, + "completions/min_length": 922.0, + "completions/min_terminated_length": 922.0, + "epoch": 0.06377204884667571, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6667431592941284, + "kl": 0.0, + "learning_rate": 4.763630089717046e-07, + "loss": 0.0349, + "num_tokens": 6425994.0, + "reward": 1.0833334922790527, + "reward_std": 0.19407902657985687, + "rewards/correctness_reward_func/mean": 0.7833333015441895, + "rewards/correctness_reward_func/std": 0.26227444410324097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 188 + }, + { + "completion_length": 1321.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3460.0, + "completions/max_terminated_length": 3460.0, + "completions/mean_length": 1321.666748046875, + "completions/mean_terminated_length": 1321.666748046875, + "completions/min_length": 463.0, + "completions/min_terminated_length": 463.0, + "epoch": 0.0641112618724559, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.37701964378356934, + "kl": 0.0, + "learning_rate": 4.761904761904761e-07, + "loss": 0.0042, + "num_tokens": 6448568.0, + "reward": 0.7166666984558105, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.5149286985397339, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 189 + }, + { + "completion_length": 2525.33349609375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4880.0, + "completions/max_terminated_length": 4880.0, + "completions/mean_length": 2525.33349609375, + "completions/mean_terminated_length": 2525.33349609375, + "completions/min_length": 879.0, + "completions/min_terminated_length": 879.0, + "epoch": 0.06445047489823609, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.206994891166687, + "kl": 0.0, + "learning_rate": 4.7601794340924773e-07, + "loss": 0.0475, + "num_tokens": 6491160.0, + "reward": 0.8333333730697632, + "reward_std": 0.4772879481315613, + "rewards/correctness_reward_func/mean": 0.5333333015441895, + "rewards/correctness_reward_func/std": 0.47736650705337524, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 190 + }, + { + "completion_length": 994.5833435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2085.0, + "completions/max_terminated_length": 2085.0, + "completions/mean_length": 994.5833740234375, + "completions/mean_terminated_length": 994.5833740234375, + "completions/min_length": 526.0, + "completions/min_terminated_length": 526.0, + "epoch": 0.06478968792401628, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.1302626862507168e-07, + "kl": 0.0, + "learning_rate": 4.758454106280193e-07, + "loss": 0.0, + "num_tokens": 6513985.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 191 + }, + { + "completion_length": 1926.0000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3138.0, + "completions/max_terminated_length": 3138.0, + "completions/mean_length": 1926.0, + "completions/mean_terminated_length": 1926.0, + "completions/min_length": 926.0, + "completions/min_terminated_length": 926.0, + "epoch": 0.06512890094979647, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.765222859579808e-07, + "kl": 0.0, + "learning_rate": 4.756728778467909e-07, + "loss": 0.0, + "num_tokens": 6548065.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 192 + }, + { + "completion_length": 2050.666748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3179.0, + "completions/max_terminated_length": 3179.0, + "completions/mean_length": 2050.666748046875, + "completions/mean_terminated_length": 2050.666748046875, + "completions/min_length": 708.0, + "completions/min_terminated_length": 708.0, + "epoch": 0.06546811397557666, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0529081606591717e-07, + "kl": 0.0, + "learning_rate": 4.7550034506556244e-07, + "loss": 0.0, + "num_tokens": 6586623.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 193 + }, + { + "completion_length": 2184.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3969.0, + "completions/max_terminated_length": 3969.0, + "completions/mean_length": 2184.58349609375, + "completions/mean_terminated_length": 2184.58349609375, + "completions/min_length": 853.0, + "completions/min_terminated_length": 853.0, + "epoch": 0.06580732700135686, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.75327812284334e-07, + "loss": 0.0, + "num_tokens": 6622516.0, + "reward": 0.30000001192092896, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 194 + }, + { + "completion_length": 1352.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2634.0, + "completions/max_terminated_length": 2634.0, + "completions/mean_length": 1352.666748046875, + "completions/mean_terminated_length": 1352.666748046875, + "completions/min_length": 597.0, + "completions/min_terminated_length": 597.0, + "epoch": 0.06614654002713705, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.10883874446153641, + "kl": 0.0, + "learning_rate": 4.751552795031056e-07, + "loss": -0.0022, + "num_tokens": 6653136.0, + "reward": 1.1166667938232422, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 195 + }, + { + "completion_length": 1927.0000610351562, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4352.0, + "completions/mean_length": 3025.166748046875, + "completions/mean_terminated_length": 2312.400146484375, + "completions/min_length": 1171.0, + "completions/min_terminated_length": 1171.0, + "epoch": 0.06648575305291723, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4869142770767212, + "kl": NaN, + "learning_rate": 4.7498274672187715e-07, + "loss": -0.0407, + "num_tokens": 6689124.0, + "reward": 0.8500000834465027, + "reward_std": 0.279284805059433, + "rewards/correctness_reward_func/mean": 0.6000000238418579, + "rewards/correctness_reward_func/std": 0.36181360483169556, + "rewards/format_reward_func/mean": 0.2500000298023224, + "rewards/format_reward_func/std": 0.11677484214305878, + "step": 196 + }, + { + "completion_length": 1327.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2700.0, + "completions/max_terminated_length": 2700.0, + "completions/mean_length": 1327.0, + "completions/mean_terminated_length": 1327.0, + "completions/min_length": 452.0, + "completions/min_terminated_length": 452.0, + "epoch": 0.06682496607869742, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.299483849645185e-08, + "kl": 0.0, + "learning_rate": 4.748102139406487e-07, + "loss": 0.0, + "num_tokens": 6719208.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 197 + }, + { + "completion_length": 1178.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3020.0, + "completions/max_terminated_length": 3020.0, + "completions/mean_length": 1178.8333740234375, + "completions/mean_terminated_length": 1178.8333740234375, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "epoch": 0.06716417910447761, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.08082883059978485, + "kl": 0.0, + "learning_rate": 4.7463768115942026e-07, + "loss": -0.0002, + "num_tokens": 6742648.0, + "reward": 0.6375000476837158, + "reward_std": 0.03061862289905548, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.23750001192092896, + "rewards/format_reward_func/std": 0.07723929733037949, + "step": 198 + }, + { + "completion_length": 2507.416748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5850.0, + "completions/max_terminated_length": 5850.0, + "completions/mean_length": 2507.416748046875, + "completions/mean_terminated_length": 2507.416748046875, + "completions/min_length": 1005.0, + "completions/min_terminated_length": 1005.0, + "epoch": 0.0675033921302578, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6426955461502075, + "kl": 0.0, + "learning_rate": 4.7446514837819186e-07, + "loss": -0.0252, + "num_tokens": 6785319.0, + "reward": 1.0166666507720947, + "reward_std": 0.24013885855674744, + "rewards/correctness_reward_func/mean": 0.7166666984558105, + "rewards/correctness_reward_func/std": 0.4386618733406067, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 199 + }, + { + "completion_length": 1075.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2798.0, + "completions/max_terminated_length": 2798.0, + "completions/mean_length": 1075.0833740234375, + "completions/mean_terminated_length": 1075.0833740234375, + "completions/min_length": 456.0, + "completions/min_terminated_length": 456.0, + "epoch": 0.06784260515603799, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.05683635175228119, + "kl": 0.0, + "learning_rate": 4.7429261559696336e-07, + "loss": 0.0, + "num_tokens": 6809584.0, + "reward": 1.2833333015441895, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.9833333492279053, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 200 + }, + { + "completion_length": 3697.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6119.0, + "completions/max_terminated_length": 6119.0, + "completions/mean_length": 3697.58349609375, + "completions/mean_terminated_length": 3697.58349609375, + "completions/min_length": 1100.0, + "completions/min_terminated_length": 1100.0, + "epoch": 0.06818181818181818, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.3760663364337233e-07, + "kl": 0.0, + "learning_rate": 4.7412008281573497e-07, + "loss": 0.0, + "num_tokens": 6863723.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 201 + }, + { + "completion_length": 2568.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4082.0, + "completions/max_terminated_length": 4082.0, + "completions/mean_length": 2568.916748046875, + "completions/mean_terminated_length": 2568.916748046875, + "completions/min_length": 1818.0, + "completions/min_terminated_length": 1818.0, + "epoch": 0.06852103120759837, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7128849029541016, + "kl": 0.0, + "learning_rate": 4.739475500345065e-07, + "loss": -0.0046, + "num_tokens": 6904072.0, + "reward": 1.1041667461395264, + "reward_std": 0.23474274575710297, + "rewards/correctness_reward_func/mean": 0.8166666030883789, + "rewards/correctness_reward_func/std": 0.27579087018966675, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 202 + }, + { + "completion_length": 2055.416748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3684.0, + "completions/max_terminated_length": 3684.0, + "completions/mean_length": 2055.416748046875, + "completions/mean_terminated_length": 2055.416748046875, + "completions/min_length": 790.0, + "completions/min_terminated_length": 790.0, + "epoch": 0.06886024423337857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10221155732870102, + "kl": 0.0, + "learning_rate": 4.737750172532781e-07, + "loss": 0.0007, + "num_tokens": 6943305.0, + "reward": 1.0750000476837158, + "reward_std": 0.06123722717165947, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 203 + }, + { + "completion_length": 1224.2500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3263.0, + "completions/max_terminated_length": 3263.0, + "completions/mean_length": 1224.25, + "completions/mean_terminated_length": 1224.25, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.06919945725915876, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.1618932894161844e-07, + "kl": 0.0, + "learning_rate": 4.736024844720496e-07, + "loss": 0.0, + "num_tokens": 6971046.0, + "reward": 1.125, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.22500000894069672, + "rewards/format_reward_func/std": 0.07833494991064072, + "step": 204 + }, + { + "completion_length": 1673.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3900.0, + "completions/max_terminated_length": 3900.0, + "completions/mean_length": 1673.8333740234375, + "completions/mean_terminated_length": 1673.8333740234375, + "completions/min_length": 938.0, + "completions/min_terminated_length": 938.0, + "epoch": 0.06953867028493894, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.1687034152128035e-07, + "kl": 0.0, + "learning_rate": 4.7342995169082123e-07, + "loss": 0.0, + "num_tokens": 7004194.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 205 + }, + { + "completion_length": 1079.5, + "completions/clipped_ratio": 0.33333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4487.0, + "completions/mean_length": 3275.83349609375, + "completions/mean_terminated_length": 1619.25, + "completions/min_length": 581.0, + "completions/min_terminated_length": 581.0, + "epoch": 0.06987788331071913, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.931649923324585, + "kl": NaN, + "learning_rate": 4.732574189095928e-07, + "loss": -0.0345, + "num_tokens": 7028488.0, + "reward": 0.7666666507720947, + "reward_std": 0.24533745646476746, + "rewards/correctness_reward_func/mean": 0.5666666626930237, + "rewards/correctness_reward_func/std": 0.5033223032951355, + "rewards/format_reward_func/mean": 0.20000000298023224, + "rewards/format_reward_func/std": 0.133143812417984, + "step": 206 + }, + { + "completion_length": 1299.6667175292969, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3137.0, + "completions/max_terminated_length": 3137.0, + "completions/mean_length": 1299.666748046875, + "completions/mean_terminated_length": 1299.666748046875, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "epoch": 0.07021709633649932, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07907649129629135, + "kl": 0.0, + "learning_rate": 4.730848861283644e-07, + "loss": -0.0023, + "num_tokens": 7053222.0, + "reward": 1.2833333015441895, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.9833333492279053, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 207 + }, + { + "completion_length": 1989.5000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2931.0, + "completions/max_terminated_length": 2931.0, + "completions/mean_length": 1989.5, + "completions/mean_terminated_length": 1989.5, + "completions/min_length": 709.0, + "completions/min_terminated_length": 709.0, + "epoch": 0.07055630936227951, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.503600597381592, + "kl": 0.0, + "learning_rate": 4.7291235334713594e-07, + "loss": 0.0031, + "num_tokens": 7089882.0, + "reward": 0.8500000834465027, + "reward_std": 0.2473839521408081, + "rewards/correctness_reward_func/mean": 0.550000011920929, + "rewards/correctness_reward_func/std": 0.4100997745990753, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 208 + }, + { + "completion_length": 2048.5834350585938, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5063.0, + "completions/mean_length": 3146.75, + "completions/mean_terminated_length": 2458.300048828125, + "completions/min_length": 1278.0, + "completions/min_terminated_length": 1278.0, + "epoch": 0.0708955223880597, + "frac_reward_zero_std": 0.5, + "grad_norm": 6.7276811599731445, + "kl": NaN, + "learning_rate": 4.727398205659075e-07, + "loss": -0.0123, + "num_tokens": 7128205.0, + "reward": 0.6375000476837158, + "reward_std": 0.07373940199613571, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.23750001192092896, + "rewards/format_reward_func/std": 0.11894422769546509, + "step": 209 + }, + { + "completion_length": 2508.166748046875, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6519.0, + "completions/mean_length": 4155.4169921875, + "completions/mean_terminated_length": 3344.22216796875, + "completions/min_length": 1361.0, + "completions/min_terminated_length": 1361.0, + "epoch": 0.07123473541383989, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.17133860290050507, + "kl": NaN, + "learning_rate": 4.725672877846791e-07, + "loss": -0.0279, + "num_tokens": 7167573.0, + "reward": 0.6250001192092896, + "reward_std": 0.08215838670730591, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.22500000894069672, + "rewards/format_reward_func/std": 0.13568010926246643, + "step": 210 + }, + { + "completion_length": 1452.0000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3934.0, + "completions/max_terminated_length": 3934.0, + "completions/mean_length": 1452.0, + "completions/mean_terminated_length": 1452.0, + "completions/min_length": 423.0, + "completions/min_terminated_length": 423.0, + "epoch": 0.07157394843962007, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1274157166481018, + "kl": 0.0, + "learning_rate": 4.723947550034506e-07, + "loss": -0.0028, + "num_tokens": 7198827.0, + "reward": 1.2000000476837158, + "reward_std": 0.09350206702947617, + "rewards/correctness_reward_func/mean": 0.949999988079071, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.2500000298023224, + "rewards/format_reward_func/std": 0.07385490089654922, + "step": 211 + }, + { + "completion_length": 1059.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1710.0, + "completions/max_terminated_length": 1710.0, + "completions/mean_length": 1059.5833740234375, + "completions/mean_terminated_length": 1059.5833740234375, + "completions/min_length": 473.0, + "completions/min_terminated_length": 473.0, + "epoch": 0.07191316146540028, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.049741536378860474, + "kl": 0.0, + "learning_rate": 4.722222222222222e-07, + "loss": -0.0011, + "num_tokens": 7226206.0, + "reward": 1.1875, + "reward_std": 0.030618613585829735, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 212 + }, + { + "completion_length": 2186.7501220703125, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4565.0, + "completions/mean_length": 2735.83349609375, + "completions/mean_terminated_length": 2385.54541015625, + "completions/min_length": 1381.0, + "completions/min_terminated_length": 1381.0, + "epoch": 0.07225237449118047, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7999017834663391, + "kl": NaN, + "learning_rate": 4.7204968944099376e-07, + "loss": -0.0145, + "num_tokens": 7262761.0, + "reward": 0.658333420753479, + "reward_std": 0.25531625747680664, + "rewards/correctness_reward_func/mean": 0.38333332538604736, + "rewards/correctness_reward_func/std": 0.4783177673816681, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 213 + }, + { + "completion_length": 2076.0, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4194.0, + "completions/mean_length": 2625.08349609375, + "completions/mean_terminated_length": 2264.727294921875, + "completions/min_length": 1039.0, + "completions/min_terminated_length": 1039.0, + "epoch": 0.07259158751696065, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8687799572944641, + "kl": NaN, + "learning_rate": 4.7187715665976537e-07, + "loss": -0.02, + "num_tokens": 7298377.0, + "reward": 0.625, + "reward_std": 0.5025304555892944, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4972652792930603, + "rewards/format_reward_func/mean": 0.22500000894069672, + "rewards/format_reward_func/std": 0.10112998634576797, + "step": 214 + }, + { + "completion_length": 1112.5000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2367.0, + "completions/max_terminated_length": 2367.0, + "completions/mean_length": 1112.5, + "completions/mean_terminated_length": 1112.5, + "completions/min_length": 573.0, + "completions/min_terminated_length": 573.0, + "epoch": 0.07293080054274084, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1586325143753129e-07, + "kl": 0.0, + "learning_rate": 4.7170462387853687e-07, + "loss": 0.0, + "num_tokens": 7324171.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 215 + }, + { + "completion_length": 1243.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2557.0, + "completions/max_terminated_length": 2557.0, + "completions/mean_length": 1243.916748046875, + "completions/mean_terminated_length": 1243.916748046875, + "completions/min_length": 653.0, + "completions/min_terminated_length": 653.0, + "epoch": 0.07327001356852103, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.3274558484554291, + "kl": 0.0, + "learning_rate": 4.7153209109730847e-07, + "loss": -0.0002, + "num_tokens": 7354998.0, + "reward": 1.0208334922790527, + "reward_std": 0.19391795992851257, + "rewards/correctness_reward_func/mean": 0.7333333492279053, + "rewards/correctness_reward_func/std": 0.23094011843204498, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 216 + }, + { + "completion_length": 1607.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4015.0, + "completions/max_terminated_length": 4015.0, + "completions/mean_length": 1607.0833740234375, + "completions/mean_terminated_length": 1607.0833740234375, + "completions/min_length": 716.0, + "completions/min_terminated_length": 716.0, + "epoch": 0.07360922659430122, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.11406178027391434, + "kl": 0.0, + "learning_rate": 4.7135955831608e-07, + "loss": -0.0028, + "num_tokens": 7391251.0, + "reward": 0.7333333492279053, + "reward_std": 0.051639772951602936, + "rewards/correctness_reward_func/mean": 0.43333330750465393, + "rewards/correctness_reward_func/std": 0.45792683959007263, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 217 + }, + { + "completion_length": 774.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1245.0, + "completions/max_terminated_length": 1245.0, + "completions/mean_length": 774.5833740234375, + "completions/mean_terminated_length": 774.5833740234375, + "completions/min_length": 470.0, + "completions/min_terminated_length": 470.0, + "epoch": 0.07394843962008141, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0900874137878418, + "kl": 0.0, + "learning_rate": 4.7118702553485163e-07, + "loss": 0.0, + "num_tokens": 7416254.0, + "reward": 1.183333396911621, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 218 + }, + { + "completion_length": 1318.0000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2595.0, + "completions/max_terminated_length": 2595.0, + "completions/mean_length": 1318.0, + "completions/mean_terminated_length": 1318.0, + "completions/min_length": 544.0, + "completions/min_terminated_length": 544.0, + "epoch": 0.0742876526458616, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0830475240945816, + "kl": 0.0, + "learning_rate": 4.7101449275362313e-07, + "loss": -0.0019, + "num_tokens": 7442162.0, + "reward": 1.133333444595337, + "reward_std": 0.051639772951602936, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 219 + }, + { + "completion_length": 1440.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3651.0, + "completions/max_terminated_length": 3651.0, + "completions/mean_length": 1440.75, + "completions/mean_terminated_length": 1440.75, + "completions/min_length": 604.0, + "completions/min_terminated_length": 604.0, + "epoch": 0.07462686567164178, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.1543527990579605, + "kl": 0.0, + "learning_rate": 4.7084195997239474e-07, + "loss": -0.0023, + "num_tokens": 7466825.0, + "reward": 1.2333333492279053, + "reward_std": 0.051639772951602936, + "rewards/correctness_reward_func/mean": 0.9333333373069763, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 220 + }, + { + "completion_length": 1963.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3612.0, + "completions/max_terminated_length": 3612.0, + "completions/mean_length": 1963.0, + "completions/mean_terminated_length": 1963.0, + "completions/min_length": 696.0, + "completions/min_terminated_length": 696.0, + "epoch": 0.07496607869742199, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09828822314739227, + "kl": 0.0, + "learning_rate": 4.706694271911663e-07, + "loss": 0.002, + "num_tokens": 7500995.0, + "reward": 1.120833396911621, + "reward_std": 0.04005204886198044, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 221 + }, + { + "completion_length": 1688.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4377.0, + "completions/max_terminated_length": 4377.0, + "completions/mean_length": 1688.8333740234375, + "completions/mean_terminated_length": 1688.8333740234375, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "epoch": 0.07530529172320218, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.280002236366272, + "kl": 0.0, + "learning_rate": 4.704968944099379e-07, + "loss": 0.0004, + "num_tokens": 7535055.0, + "reward": 0.9333333373069763, + "reward_std": 0.20655912160873413, + "rewards/correctness_reward_func/mean": 0.6333333253860474, + "rewards/correctness_reward_func/std": 0.4735424220561981, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 222 + }, + { + "completion_length": 1618.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2832.0, + "completions/max_terminated_length": 2832.0, + "completions/mean_length": 1618.666748046875, + "completions/mean_terminated_length": 1618.666748046875, + "completions/min_length": 1044.0, + "completions/min_terminated_length": 1044.0, + "epoch": 0.07564450474898236, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5872350931167603, + "kl": 0.0, + "learning_rate": 4.7032436162870945e-07, + "loss": 0.0123, + "num_tokens": 7569689.0, + "reward": 1.087499976158142, + "reward_std": 0.3197711706161499, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.38138505816459656, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 223 + }, + { + "completion_length": 1416.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2253.0, + "completions/max_terminated_length": 2253.0, + "completions/mean_length": 1416.8333740234375, + "completions/mean_terminated_length": 1416.8333740234375, + "completions/min_length": 550.0, + "completions/min_terminated_length": 550.0, + "epoch": 0.07598371777476255, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.0870072603429435e-07, + "kl": 0.0, + "learning_rate": 4.70151828847481e-07, + "loss": 0.0, + "num_tokens": 7598295.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 224 + }, + { + "completion_length": 1122.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2544.0, + "completions/max_terminated_length": 2544.0, + "completions/mean_length": 1122.75, + "completions/mean_terminated_length": 1122.75, + "completions/min_length": 503.0, + "completions/min_terminated_length": 503.0, + "epoch": 0.07632293080054274, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3819059133529663, + "kl": 0.0, + "learning_rate": 4.699792960662526e-07, + "loss": -0.0098, + "num_tokens": 7625124.0, + "reward": 1.0833333730697632, + "reward_std": 0.2010922133922577, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.2806917726993561, + "rewards/format_reward_func/mean": 0.2500000298023224, + "rewards/format_reward_func/std": 0.07385490089654922, + "step": 225 + }, + { + "completion_length": 2394.666748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3629.0, + "completions/max_terminated_length": 3629.0, + "completions/mean_length": 2394.666748046875, + "completions/mean_terminated_length": 2394.666748046875, + "completions/min_length": 1138.0, + "completions/min_terminated_length": 1138.0, + "epoch": 0.07666214382632293, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9332161545753479, + "kl": 0.0, + "learning_rate": 4.698067632850241e-07, + "loss": 0.0003, + "num_tokens": 7666064.0, + "reward": 0.9166666865348816, + "reward_std": 0.3129710853099823, + "rewards/correctness_reward_func/mean": 0.6166666150093079, + "rewards/correctness_reward_func/std": 0.4628632962703705, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 226 + }, + { + "completion_length": 864.1667175292969, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1947.0, + "completions/max_terminated_length": 1947.0, + "completions/mean_length": 864.1666870117188, + "completions/mean_terminated_length": 864.1666870117188, + "completions/min_length": 467.0, + "completions/min_terminated_length": 467.0, + "epoch": 0.07700135685210312, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.138903868053603e-08, + "kl": 0.0, + "learning_rate": 4.696342305037957e-07, + "loss": 0.0, + "num_tokens": 7688386.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 227 + }, + { + "completion_length": 2737.5001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4239.0, + "completions/max_terminated_length": 4239.0, + "completions/mean_length": 2737.5, + "completions/mean_terminated_length": 2737.5, + "completions/min_length": 1432.0, + "completions/min_terminated_length": 1432.0, + "epoch": 0.0773405698778833, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6406996846199036, + "kl": 0.0, + "learning_rate": 4.6946169772256726e-07, + "loss": 0.0023, + "num_tokens": 7735372.0, + "reward": 0.8000000715255737, + "reward_std": 0.24494895339012146, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.4472135901451111, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 228 + }, + { + "completion_length": 1794.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5741.0, + "completions/max_terminated_length": 5741.0, + "completions/mean_length": 1794.166748046875, + "completions/mean_terminated_length": 1794.166748046875, + "completions/min_length": 699.0, + "completions/min_terminated_length": 699.0, + "epoch": 0.0776797829036635, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.823469458235195e-07, + "kl": 0.0, + "learning_rate": 4.6928916494133887e-07, + "loss": 0.0, + "num_tokens": 7763172.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 229 + }, + { + "completion_length": 2012.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3545.0, + "completions/max_terminated_length": 3545.0, + "completions/mean_length": 2012.166748046875, + "completions/mean_terminated_length": 2012.166748046875, + "completions/min_length": 992.0, + "completions/min_terminated_length": 992.0, + "epoch": 0.0780189959294437, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5563703775405884, + "kl": 0.0, + "learning_rate": 4.6911663216011037e-07, + "loss": -0.0003, + "num_tokens": 7796900.0, + "reward": 0.8500000834465027, + "reward_std": 0.2345207929611206, + "rewards/correctness_reward_func/mean": 0.550000011920929, + "rewards/correctness_reward_func/std": 0.4100997745990753, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 230 + }, + { + "completion_length": 1420.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2537.0, + "completions/max_terminated_length": 2537.0, + "completions/mean_length": 1420.75, + "completions/mean_terminated_length": 1420.75, + "completions/min_length": 1000.0, + "completions/min_terminated_length": 1000.0, + "epoch": 0.07835820895522388, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.374015748500824, + "kl": 0.0, + "learning_rate": 4.68944099378882e-07, + "loss": -0.002, + "num_tokens": 7822997.0, + "reward": 0.6541666984558105, + "reward_std": 0.21588000655174255, + "rewards/correctness_reward_func/mean": 0.36666667461395264, + "rewards/correctness_reward_func/std": 0.45792683959007263, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 231 + }, + { + "completion_length": 1682.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4546.0, + "completions/max_terminated_length": 4546.0, + "completions/mean_length": 1682.166748046875, + "completions/mean_terminated_length": 1682.166748046875, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.07869742198100407, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07696101814508438, + "kl": 0.0, + "learning_rate": 4.6877156659765353e-07, + "loss": -0.0003, + "num_tokens": 7851037.0, + "reward": 0.7875000834465027, + "reward_std": 0.03061862289905548, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 232 + }, + { + "completion_length": 1198.5833740234375, + "completions/clipped_ratio": 0.33333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6101.0, + "completions/mean_length": 3394.916748046875, + "completions/mean_terminated_length": 1797.875, + "completions/min_length": 535.0, + "completions/min_terminated_length": 535.0, + "epoch": 0.07903663500678426, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.621031403541565, + "kl": NaN, + "learning_rate": 4.6859903381642513e-07, + "loss": -0.0886, + "num_tokens": 7876436.0, + "reward": 0.7625000476837158, + "reward_std": 0.3166946768760681, + "rewards/correctness_reward_func/mean": 0.550000011920929, + "rewards/correctness_reward_func/std": 0.4100997745990753, + "rewards/format_reward_func/mean": 0.21250002086162567, + "rewards/format_reward_func/std": 0.13505050539970398, + "step": 233 + }, + { + "completion_length": 2415.08349609375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5319.0, + "completions/max_terminated_length": 5319.0, + "completions/mean_length": 2415.08349609375, + "completions/mean_terminated_length": 2415.08349609375, + "completions/min_length": 1029.0, + "completions/min_terminated_length": 1029.0, + "epoch": 0.07937584803256445, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.14514191448688507, + "kl": 0.0, + "learning_rate": 4.6842650103519663e-07, + "loss": -0.0108, + "num_tokens": 7914051.0, + "reward": 0.7333333492279053, + "reward_std": 0.051639772951602936, + "rewards/correctness_reward_func/mean": 0.43333330750465393, + "rewards/correctness_reward_func/std": 0.45792683959007263, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 234 + }, + { + "completion_length": 1313.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3703.0, + "completions/max_terminated_length": 3703.0, + "completions/mean_length": 1313.75, + "completions/mean_terminated_length": 1313.75, + "completions/min_length": 428.0, + "completions/min_terminated_length": 428.0, + "epoch": 0.07971506105834464, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3174349069595337, + "kl": 0.0, + "learning_rate": 4.6825396825396824e-07, + "loss": 0.001, + "num_tokens": 7941624.0, + "reward": 0.7125000357627869, + "reward_std": 0.09585144370794296, + "rewards/correctness_reward_func/mean": 0.45000001788139343, + "rewards/correctness_reward_func/std": 0.47577688097953796, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 235 + }, + { + "completion_length": 800.3333435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1525.0, + "completions/max_terminated_length": 1525.0, + "completions/mean_length": 800.3333740234375, + "completions/mean_terminated_length": 800.3333740234375, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.08005427408412483, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.680814354727398e-07, + "loss": 0.0, + "num_tokens": 7963864.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 236 + }, + { + "completion_length": 2569.166748046875, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4853.0, + "completions/mean_length": 3667.33349609375, + "completions/mean_terminated_length": 3083.0, + "completions/min_length": 1522.0, + "completions/min_terminated_length": 1522.0, + "epoch": 0.08039348710990502, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3099312782287598, + "kl": NaN, + "learning_rate": 4.6790890269151135e-07, + "loss": -0.0863, + "num_tokens": 8008752.0, + "reward": 0.9833334684371948, + "reward_std": 0.35421618819236755, + "rewards/correctness_reward_func/mean": 0.7333332896232605, + "rewards/correctness_reward_func/std": 0.3550501763820648, + "rewards/format_reward_func/mean": 0.2500000298023224, + "rewards/format_reward_func/std": 0.11677484214305878, + "step": 237 + }, + { + "completion_length": 883.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1392.0, + "completions/max_terminated_length": 1392.0, + "completions/mean_length": 883.4166870117188, + "completions/mean_terminated_length": 883.4166870117188, + "completions/min_length": 577.0, + "completions/min_terminated_length": 577.0, + "epoch": 0.0807327001356852, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08914493024349213, + "kl": 0.0, + "learning_rate": 4.677363699102829e-07, + "loss": 0.001, + "num_tokens": 8031389.0, + "reward": 0.6916667819023132, + "reward_std": 0.07955466210842133, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 238 + }, + { + "completion_length": 1487.2500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3340.0, + "completions/max_terminated_length": 3340.0, + "completions/mean_length": 1487.25, + "completions/mean_terminated_length": 1487.25, + "completions/min_length": 669.0, + "completions/min_terminated_length": 669.0, + "epoch": 0.0810719131614654, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.675638371290545e-07, + "loss": 0.0, + "num_tokens": 8063738.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 239 + }, + { + "completion_length": 3740.416748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5830.0, + "completions/max_terminated_length": 5830.0, + "completions/mean_length": 3740.416748046875, + "completions/mean_terminated_length": 3740.416748046875, + "completions/min_length": 1517.0, + "completions/min_terminated_length": 1517.0, + "epoch": 0.0814111261872456, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.10733194649219513, + "kl": 0.0, + "learning_rate": 4.673913043478261e-07, + "loss": -0.005, + "num_tokens": 8124595.0, + "reward": 0.7166666984558105, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 240 + }, + { + "completion_length": 1372.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3355.0, + "completions/max_terminated_length": 3355.0, + "completions/mean_length": 1372.8333740234375, + "completions/mean_terminated_length": 1372.8333740234375, + "completions/min_length": 651.0, + "completions/min_terminated_length": 651.0, + "epoch": 0.08175033921302578, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.8452433298298274e-07, + "kl": 0.0, + "learning_rate": 4.672187715665976e-07, + "loss": 0.0, + "num_tokens": 8154857.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 241 + }, + { + "completion_length": 2199.2500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5927.0, + "completions/max_terminated_length": 5927.0, + "completions/mean_length": 2199.25, + "completions/mean_terminated_length": 2199.25, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "epoch": 0.08208955223880597, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.19338767230510712, + "kl": 0.0, + "learning_rate": 4.670462387853692e-07, + "loss": -0.0023, + "num_tokens": 8192258.0, + "reward": 0.7333333492279053, + "reward_std": 0.05163976177573204, + "rewards/correctness_reward_func/mean": 0.43333330750465393, + "rewards/correctness_reward_func/std": 0.45792683959007263, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 242 + }, + { + "completion_length": 1042.7500305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2626.0, + "completions/max_terminated_length": 2626.0, + "completions/mean_length": 1042.75, + "completions/mean_terminated_length": 1042.75, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "epoch": 0.08242876526458616, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4441406726837158, + "kl": 0.0, + "learning_rate": 4.6687370600414077e-07, + "loss": -0.0016, + "num_tokens": 8216441.0, + "reward": 1.1000001430511475, + "reward_std": 0.20000001788139343, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.2696799635887146, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 243 + }, + { + "completion_length": 1917.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5127.0, + "completions/max_terminated_length": 5127.0, + "completions/mean_length": 1917.916748046875, + "completions/mean_terminated_length": 1917.916748046875, + "completions/min_length": 409.0, + "completions/min_terminated_length": 409.0, + "epoch": 0.08276797829036635, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.564266796094671e-07, + "kl": 0.0, + "learning_rate": 4.667011732229124e-07, + "loss": 0.0, + "num_tokens": 8256160.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 244 + }, + { + "completion_length": 1719.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2907.0, + "completions/max_terminated_length": 2907.0, + "completions/mean_length": 1719.0833740234375, + "completions/mean_terminated_length": 1719.0833740234375, + "completions/min_length": 848.0, + "completions/min_terminated_length": 848.0, + "epoch": 0.08310719131614654, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6577960848808289, + "kl": 0.0, + "learning_rate": 4.665286404416839e-07, + "loss": -0.0004, + "num_tokens": 8285387.0, + "reward": 1.0374999046325684, + "reward_std": 0.3044798970222473, + "rewards/correctness_reward_func/mean": 0.75, + "rewards/correctness_reward_func/std": 0.45226702094078064, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 245 + }, + { + "completion_length": 2581.7501220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4833.0, + "completions/max_terminated_length": 4833.0, + "completions/mean_length": 2581.75, + "completions/mean_terminated_length": 2581.75, + "completions/min_length": 904.0, + "completions/min_terminated_length": 904.0, + "epoch": 0.08344640434192672, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.663561076604555e-07, + "loss": 0.0, + "num_tokens": 8325866.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 246 + }, + { + "completion_length": 1798.5000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3059.0, + "completions/max_terminated_length": 3059.0, + "completions/mean_length": 1798.5, + "completions/mean_terminated_length": 1798.5, + "completions/min_length": 750.0, + "completions/min_terminated_length": 750.0, + "epoch": 0.08378561736770691, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.0002641437931743e-07, + "kl": 0.0, + "learning_rate": 4.6618357487922703e-07, + "loss": 0.0, + "num_tokens": 8360000.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 247 + }, + { + "completion_length": 1804.666748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3874.0, + "completions/max_terminated_length": 3874.0, + "completions/mean_length": 1804.666748046875, + "completions/mean_terminated_length": 1804.666748046875, + "completions/min_length": 550.0, + "completions/min_terminated_length": 550.0, + "epoch": 0.08412483039348712, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7278051376342773, + "kl": 0.0, + "learning_rate": 4.660110420979986e-07, + "loss": -0.0089, + "num_tokens": 8396068.0, + "reward": 0.6333333253860474, + "reward_std": 0.25819891691207886, + "rewards/correctness_reward_func/mean": 0.3333333432674408, + "rewards/correctness_reward_func/std": 0.4923659563064575, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 248 + }, + { + "completion_length": 1632.5000610351562, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6421.0, + "completions/mean_length": 2730.666748046875, + "completions/mean_terminated_length": 1959.0, + "completions/min_length": 531.0, + "completions/min_terminated_length": 531.0, + "epoch": 0.0844640434192673, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.0582363605499268, + "kl": NaN, + "learning_rate": 4.6583850931677014e-07, + "loss": -0.0822, + "num_tokens": 8426572.0, + "reward": 0.9833334684371948, + "reward_std": 0.3356585204601288, + "rewards/correctness_reward_func/mean": 0.7333332896232605, + "rewards/correctness_reward_func/std": 0.35505014657974243, + "rewards/format_reward_func/mean": 0.25, + "rewards/format_reward_func/std": 0.11677484214305878, + "step": 249 + }, + { + "completion_length": 936.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2077.0, + "completions/max_terminated_length": 2077.0, + "completions/mean_length": 936.4166870117188, + "completions/mean_terminated_length": 936.4166870117188, + "completions/min_length": 527.0, + "completions/min_terminated_length": 527.0, + "epoch": 0.08480325644504749, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09999001771211624, + "kl": 0.0, + "learning_rate": 4.6566597653554174e-07, + "loss": -0.0015, + "num_tokens": 8449605.0, + "reward": 1.0916666984558105, + "reward_std": 0.07955464720726013, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 250 + }, + { + "completion_length": 1101.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2416.0, + "completions/max_terminated_length": 2416.0, + "completions/mean_length": 1101.3333740234375, + "completions/mean_terminated_length": 1101.3333740234375, + "completions/min_length": 439.0, + "completions/min_terminated_length": 439.0, + "epoch": 0.08514246947082768, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.057271238416433334, + "kl": 0.0, + "learning_rate": 4.654934437543133e-07, + "loss": 0.0001, + "num_tokens": 8478667.0, + "reward": 1.0750000476837158, + "reward_std": 0.038729824125766754, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 251 + }, + { + "completion_length": 2234.416748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3281.0, + "completions/max_terminated_length": 3281.0, + "completions/mean_length": 2234.416748046875, + "completions/mean_terminated_length": 2234.416748046875, + "completions/min_length": 849.0, + "completions/min_terminated_length": 849.0, + "epoch": 0.08548168249660787, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.4393929126963485e-07, + "kl": 0.0, + "learning_rate": 4.6532091097308485e-07, + "loss": 0.0, + "num_tokens": 8519382.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 252 + }, + { + "completion_length": 2895.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5231.0, + "completions/max_terminated_length": 5231.0, + "completions/mean_length": 2895.83349609375, + "completions/mean_terminated_length": 2895.83349609375, + "completions/min_length": 1011.0, + "completions/min_terminated_length": 1011.0, + "epoch": 0.08582089552238806, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6422632336616516, + "kl": 0.0, + "learning_rate": 4.651483781918564e-07, + "loss": -0.0129, + "num_tokens": 8563696.0, + "reward": 0.9500000476837158, + "reward_std": 0.2917786240577698, + "rewards/correctness_reward_func/mean": 0.6500000357627869, + "rewards/correctness_reward_func/std": 0.40113475918769836, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 253 + }, + { + "completion_length": 2033.416748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3983.0, + "completions/max_terminated_length": 3983.0, + "completions/mean_length": 2033.416748046875, + "completions/mean_terminated_length": 2033.416748046875, + "completions/min_length": 619.0, + "completions/min_terminated_length": 619.0, + "epoch": 0.08616010854816825, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.05713088810443878, + "kl": 0.0, + "learning_rate": 4.64975845410628e-07, + "loss": -0.0011, + "num_tokens": 8598651.0, + "reward": 0.6875001788139343, + "reward_std": 0.03061862289905548, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 254 + }, + { + "completion_length": 1276.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2436.0, + "completions/max_terminated_length": 2436.0, + "completions/mean_length": 1276.0, + "completions/mean_terminated_length": 1276.0, + "completions/min_length": 727.0, + "completions/min_terminated_length": 727.0, + "epoch": 0.08649932157394843, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.0110948046058184e-07, + "kl": 0.0, + "learning_rate": 4.648033126293996e-07, + "loss": 0.0, + "num_tokens": 8624469.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 255 + }, + { + "completion_length": 2234.0833740234375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4899.0, + "completions/mean_length": 2783.166748046875, + "completions/mean_terminated_length": 2437.181884765625, + "completions/min_length": 444.0, + "completions/min_terminated_length": 444.0, + "epoch": 0.08683853459972862, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11855963617563248, + "kl": NaN, + "learning_rate": 4.646307798481711e-07, + "loss": -0.0076, + "num_tokens": 8665972.0, + "reward": 0.7749999761581421, + "reward_std": 0.06123725324869156, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 256 + }, + { + "completion_length": 2740.2501220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5840.0, + "completions/max_terminated_length": 5840.0, + "completions/mean_length": 2740.25, + "completions/mean_terminated_length": 2740.25, + "completions/min_length": 1029.0, + "completions/min_terminated_length": 1029.0, + "epoch": 0.08717774762550883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.644582470669427e-07, + "loss": 0.0, + "num_tokens": 8714371.0, + "reward": 0.30000001192092896, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 257 + }, + { + "completion_length": 3532.8333740234375, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6254.0, + "completions/mean_length": 5180.08349609375, + "completions/mean_terminated_length": 4710.4443359375, + "completions/min_length": 3218.0, + "completions/min_terminated_length": 3218.0, + "epoch": 0.08751696065128901, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22339831292629242, + "kl": NaN, + "learning_rate": 4.6428571428571427e-07, + "loss": -0.016, + "num_tokens": 8766629.0, + "reward": 0.6750000715255737, + "reward_std": 0.1218542754650116, + "rewards/correctness_reward_func/mean": 0.44999995827674866, + "rewards/correctness_reward_func/std": 0.47577688097953796, + "rewards/format_reward_func/mean": 0.22500000894069672, + "rewards/format_reward_func/std": 0.11965861916542053, + "step": 258 + }, + { + "completion_length": 1992.666748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3612.0, + "completions/max_terminated_length": 3612.0, + "completions/mean_length": 1992.666748046875, + "completions/mean_terminated_length": 1992.666748046875, + "completions/min_length": 869.0, + "completions/min_terminated_length": 869.0, + "epoch": 0.0878561736770692, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.47433897852897644, + "kl": 0.0, + "learning_rate": 4.641131815044858e-07, + "loss": 0.0026, + "num_tokens": 8805757.0, + "reward": 0.5541666746139526, + "reward_std": 0.2371777594089508, + "rewards/correctness_reward_func/mean": 0.2666666805744171, + "rewards/correctness_reward_func/std": 0.393892765045166, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 259 + }, + { + "completion_length": 2260.416748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5259.0, + "completions/max_terminated_length": 5259.0, + "completions/mean_length": 2260.416748046875, + "completions/mean_terminated_length": 2260.416748046875, + "completions/min_length": 1118.0, + "completions/min_terminated_length": 1118.0, + "epoch": 0.08819538670284939, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.11118398606777191, + "kl": 0.0, + "learning_rate": 4.639406487232574e-07, + "loss": 0.0031, + "num_tokens": 8846226.0, + "reward": 1.1166667938232422, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 260 + }, + { + "completion_length": 2100.5001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3065.0, + "completions/max_terminated_length": 3065.0, + "completions/mean_length": 2100.5, + "completions/mean_terminated_length": 2100.5, + "completions/min_length": 1150.0, + "completions/min_terminated_length": 1150.0, + "epoch": 0.08853459972862958, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.14715582132339478, + "kl": 0.0, + "learning_rate": 4.63768115942029e-07, + "loss": 0.0013, + "num_tokens": 8881794.0, + "reward": 1.1041667461395264, + "reward_std": 0.05571504682302475, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 261 + }, + { + "completion_length": 1970.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3088.0, + "completions/max_terminated_length": 3088.0, + "completions/mean_length": 1970.916748046875, + "completions/mean_terminated_length": 1970.916748046875, + "completions/min_length": 860.0, + "completions/min_terminated_length": 860.0, + "epoch": 0.08887381275440977, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.0765658348409488e-07, + "kl": 0.0, + "learning_rate": 4.6359558316080054e-07, + "loss": 0.0, + "num_tokens": 8914019.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 262 + }, + { + "completion_length": 1055.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2481.0, + "completions/max_terminated_length": 2481.0, + "completions/mean_length": 1055.75, + "completions/mean_terminated_length": 1055.75, + "completions/min_length": 438.0, + "completions/min_terminated_length": 438.0, + "epoch": 0.08921302578018996, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.059661220759153366, + "kl": 0.0, + "learning_rate": 4.634230503795721e-07, + "loss": -0.0, + "num_tokens": 8938184.0, + "reward": 0.6875001788139343, + "reward_std": 0.03061862289905548, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 263 + }, + { + "completion_length": 2159.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3540.0, + "completions/max_terminated_length": 3540.0, + "completions/mean_length": 2159.666748046875, + "completions/mean_terminated_length": 2159.666748046875, + "completions/min_length": 879.0, + "completions/min_terminated_length": 879.0, + "epoch": 0.08955223880597014, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.261990263401458e-07, + "kl": 0.0, + "learning_rate": 4.6325051759834364e-07, + "loss": 0.0, + "num_tokens": 8976838.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 264 + }, + { + "completion_length": 1059.0000305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2702.0, + "completions/max_terminated_length": 2702.0, + "completions/mean_length": 1059.0, + "completions/mean_terminated_length": 1059.0, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.08989145183175033, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.0293516911351617e-07, + "kl": 0.0, + "learning_rate": 4.6307798481711525e-07, + "loss": 0.0, + "num_tokens": 9005260.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 265 + }, + { + "completion_length": 1051.0000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1806.0, + "completions/max_terminated_length": 1806.0, + "completions/mean_length": 1051.0, + "completions/mean_terminated_length": 1051.0, + "completions/min_length": 509.0, + "completions/min_terminated_length": 509.0, + "epoch": 0.09023066485753053, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.35058966279029846, + "kl": 0.0, + "learning_rate": 4.629054520358868e-07, + "loss": -0.0082, + "num_tokens": 9026614.0, + "reward": 0.7833334803581238, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func/mean": 0.4833333194255829, + "rewards/correctness_reward_func/std": 0.43029236793518066, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 266 + }, + { + "completion_length": 1659.2500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3826.0, + "completions/max_terminated_length": 3826.0, + "completions/mean_length": 1659.25, + "completions/mean_terminated_length": 1659.25, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "epoch": 0.09056987788331072, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.540415346622467, + "kl": 0.0, + "learning_rate": 4.6273291925465835e-07, + "loss": 0.0041, + "num_tokens": 9057775.0, + "reward": 1.0833334922790527, + "reward_std": 0.19407902657985687, + "rewards/correctness_reward_func/mean": 0.7833333015441895, + "rewards/correctness_reward_func/std": 0.26227444410324097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 267 + }, + { + "completion_length": 2549.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3837.0, + "completions/max_terminated_length": 3837.0, + "completions/mean_length": 2549.916748046875, + "completions/mean_terminated_length": 2549.916748046875, + "completions/min_length": 1170.0, + "completions/min_terminated_length": 1170.0, + "epoch": 0.09090909090909091, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09202612936496735, + "kl": 0.0, + "learning_rate": 4.625603864734299e-07, + "loss": -0.002, + "num_tokens": 9100188.0, + "reward": 0.7166666984558105, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 268 + }, + { + "completion_length": 2686.0001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6176.0, + "completions/max_terminated_length": 6176.0, + "completions/mean_length": 2686.0, + "completions/mean_terminated_length": 2686.0, + "completions/min_length": 1286.0, + "completions/min_terminated_length": 1286.0, + "epoch": 0.0912483039348711, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.11565620452165604, + "kl": 0.0, + "learning_rate": 4.623878536922015e-07, + "loss": 0.0012, + "num_tokens": 9148038.0, + "reward": 1.1500000953674316, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.8500000834465027, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 269 + }, + { + "completion_length": 1715.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4031.0, + "completions/max_terminated_length": 4031.0, + "completions/mean_length": 1715.5833740234375, + "completions/mean_terminated_length": 1715.5833740234375, + "completions/min_length": 474.0, + "completions/min_terminated_length": 474.0, + "epoch": 0.09158751696065129, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7160343527793884, + "kl": 0.0, + "learning_rate": 4.6221532091097307e-07, + "loss": -0.0202, + "num_tokens": 9179917.0, + "reward": 0.4333333671092987, + "reward_std": 0.20655912160873413, + "rewards/correctness_reward_func/mean": 0.13333334028720856, + "rewards/correctness_reward_func/std": 0.31139957904815674, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 270 + }, + { + "completion_length": 2181.0, + "completions/clipped_ratio": 0.41666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6022.0, + "completions/mean_length": 4926.4169921875, + "completions/mean_terminated_length": 3738.857421875, + "completions/min_length": 1328.0, + "completions/min_terminated_length": 1328.0, + "epoch": 0.09192672998643148, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.20487481355667114, + "kl": NaN, + "learning_rate": 4.620427881297446e-07, + "loss": -0.019, + "num_tokens": 9219079.0, + "reward": 0.17500001192092896, + "reward_std": 0.06123724579811096, + "rewards/correctness_reward_func/mean": 0.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.17500001192092896, + "rewards/format_reward_func/std": 0.15447859466075897, + "step": 271 + }, + { + "completion_length": 2217.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3837.0, + "completions/max_terminated_length": 3837.0, + "completions/mean_length": 2217.916748046875, + "completions/mean_terminated_length": 2217.916748046875, + "completions/min_length": 841.0, + "completions/min_terminated_length": 841.0, + "epoch": 0.09226594301221167, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6833940744400024, + "kl": 0.0, + "learning_rate": 4.618702553485162e-07, + "loss": 0.0011, + "num_tokens": 9260148.0, + "reward": 0.6000000238418579, + "reward_std": 0.23664319515228271, + "rewards/correctness_reward_func/mean": 0.29999998211860657, + "rewards/correctness_reward_func/std": 0.4472135901451111, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 272 + }, + { + "completion_length": 3103.5001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4943.0, + "completions/max_terminated_length": 4943.0, + "completions/mean_length": 3103.5, + "completions/mean_terminated_length": 3103.5, + "completions/min_length": 2078.0, + "completions/min_terminated_length": 2078.0, + "epoch": 0.09260515603799185, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4680456817150116, + "kl": 0.0, + "learning_rate": 4.616977225672878e-07, + "loss": -0.0159, + "num_tokens": 9307752.0, + "reward": 1.070833444595337, + "reward_std": 0.2576434314250946, + "rewards/correctness_reward_func/mean": 0.7833333015441895, + "rewards/correctness_reward_func/std": 0.26227444410324097, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 273 + }, + { + "completion_length": 1276.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3375.0, + "completions/max_terminated_length": 3375.0, + "completions/mean_length": 1276.75, + "completions/mean_terminated_length": 1276.75, + "completions/min_length": 541.0, + "completions/min_terminated_length": 541.0, + "epoch": 0.09294436906377204, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.443483992919937e-07, + "kl": 0.0, + "learning_rate": 4.6152518978605933e-07, + "loss": 0.0, + "num_tokens": 9331791.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 274 + }, + { + "completion_length": 2395.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4769.0, + "completions/max_terminated_length": 4769.0, + "completions/mean_length": 2395.916748046875, + "completions/mean_terminated_length": 2395.916748046875, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "epoch": 0.09328358208955224, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8109861612319946, + "kl": 0.0, + "learning_rate": 4.613526570048309e-07, + "loss": 0.0024, + "num_tokens": 9368888.0, + "reward": 0.6375000476837158, + "reward_std": 0.4031320810317993, + "rewards/correctness_reward_func/mean": 0.3499999940395355, + "rewards/correctness_reward_func/std": 0.4358898997306824, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 275 + }, + { + "completion_length": 2529.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5854.0, + "completions/max_terminated_length": 5854.0, + "completions/mean_length": 2529.916748046875, + "completions/mean_terminated_length": 2529.916748046875, + "completions/min_length": 1131.0, + "completions/min_terminated_length": 1131.0, + "epoch": 0.09362279511533243, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.44502562284469604, + "kl": 0.0, + "learning_rate": 4.611801242236025e-07, + "loss": 0.0014, + "num_tokens": 9407575.0, + "reward": 1.0499999523162842, + "reward_std": 0.24738392233848572, + "rewards/correctness_reward_func/mean": 0.75, + "rewards/correctness_reward_func/std": 0.36306774616241455, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 276 + }, + { + "completion_length": 2848.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4154.0, + "completions/max_terminated_length": 4154.0, + "completions/mean_length": 2848.33349609375, + "completions/mean_terminated_length": 2848.33349609375, + "completions/min_length": 1579.0, + "completions/min_terminated_length": 1579.0, + "epoch": 0.09396200814111262, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13393691182136536, + "kl": 0.0, + "learning_rate": 4.6100759144237404e-07, + "loss": -0.0004, + "num_tokens": 9455825.0, + "reward": 0.2625000476837158, + "reward_std": 0.06934845447540283, + "rewards/correctness_reward_func/mean": 0.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 277 + }, + { + "completion_length": 3776.666748046875, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6433.0, + "completions/mean_length": 5423.9169921875, + "completions/mean_terminated_length": 5035.5556640625, + "completions/min_length": 3525.0, + "completions/min_terminated_length": 3525.0, + "epoch": 0.09430122116689281, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8137674927711487, + "kl": NaN, + "learning_rate": 4.608350586611456e-07, + "loss": -0.0881, + "num_tokens": 9508333.0, + "reward": 0.720833420753479, + "reward_std": 0.4936787486076355, + "rewards/correctness_reward_func/mean": 0.4833333492279053, + "rewards/correctness_reward_func/std": 0.43029236793518066, + "rewards/format_reward_func/mean": 0.23750001192092896, + "rewards/format_reward_func/std": 0.11894422769546509, + "step": 278 + }, + { + "completion_length": 2009.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5969.0, + "completions/max_terminated_length": 5969.0, + "completions/mean_length": 2009.3333740234375, + "completions/mean_terminated_length": 2009.3333740234375, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "epoch": 0.094640434192673, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8695447444915771, + "kl": 0.0, + "learning_rate": 4.6066252587991715e-07, + "loss": 0.0393, + "num_tokens": 9539867.0, + "reward": 0.5, + "reward_std": 0.21908903121948242, + "rewards/correctness_reward_func/mean": 0.20000000298023224, + "rewards/correctness_reward_func/std": 0.36181360483169556, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 279 + }, + { + "completion_length": 1017.5000305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1355.0, + "completions/max_terminated_length": 1355.0, + "completions/mean_length": 1017.5, + "completions/mean_terminated_length": 1017.5, + "completions/min_length": 628.0, + "completions/min_terminated_length": 628.0, + "epoch": 0.09497964721845319, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.05056990310549736, + "kl": 0.0, + "learning_rate": 4.6048999309868875e-07, + "loss": -0.001, + "num_tokens": 9560039.0, + "reward": 1.183333396911621, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 280 + }, + { + "completion_length": 2585.0001220703125, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6297.0, + "completions/mean_length": 3683.166748046875, + "completions/mean_terminated_length": 3102.0, + "completions/min_length": 592.0, + "completions/min_terminated_length": 592.0, + "epoch": 0.09531886024423337, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.1040610745549202, + "kl": NaN, + "learning_rate": 4.6031746031746025e-07, + "loss": -0.0198, + "num_tokens": 9600029.0, + "reward": 0.6625000834465027, + "reward_std": 0.06274950504302979, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.09323723614215851, + "step": 281 + }, + { + "completion_length": 2685.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5926.0, + "completions/max_terminated_length": 5926.0, + "completions/mean_length": 2685.83349609375, + "completions/mean_terminated_length": 2685.83349609375, + "completions/min_length": 1012.0, + "completions/min_terminated_length": 1012.0, + "epoch": 0.09565807327001356, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6141899824142456, + "kl": 0.0, + "learning_rate": 4.6014492753623186e-07, + "loss": -0.0061, + "num_tokens": 9644097.0, + "reward": 0.8875000476837158, + "reward_std": 0.2698235511779785, + "rewards/correctness_reward_func/mean": 0.5999999642372131, + "rewards/correctness_reward_func/std": 0.45126086473464966, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 282 + }, + { + "completion_length": 2492.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6501.0, + "completions/max_terminated_length": 6501.0, + "completions/mean_length": 2492.58349609375, + "completions/mean_terminated_length": 2492.58349609375, + "completions/min_length": 702.0, + "completions/min_terminated_length": 702.0, + "epoch": 0.09599728629579375, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.10037942975759506, + "kl": 0.0, + "learning_rate": 4.599723947550034e-07, + "loss": -0.0009, + "num_tokens": 9686560.0, + "reward": 1.1666667461395264, + "reward_std": 0.05163975805044174, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 283 + }, + { + "completion_length": 1550.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3066.0, + "completions/max_terminated_length": 3066.0, + "completions/mean_length": 1550.916748046875, + "completions/mean_terminated_length": 1550.916748046875, + "completions/min_length": 456.0, + "completions/min_terminated_length": 456.0, + "epoch": 0.09633649932157395, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.05274030938744545, + "kl": 0.0, + "learning_rate": 4.59799861973775e-07, + "loss": 0.0002, + "num_tokens": 9716757.0, + "reward": 1.1875, + "reward_std": 0.030618613585829735, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 284 + }, + { + "completion_length": 2718.08349609375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4146.0, + "completions/max_terminated_length": 4146.0, + "completions/mean_length": 2718.08349609375, + "completions/mean_terminated_length": 2718.08349609375, + "completions/min_length": 1100.0, + "completions/min_terminated_length": 1100.0, + "epoch": 0.09667571234735414, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.087214395403862, + "kl": 0.0, + "learning_rate": 4.596273291925465e-07, + "loss": -0.002, + "num_tokens": 9761284.0, + "reward": 1.183333396911621, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 285 + }, + { + "completion_length": 1447.2500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2123.0, + "completions/max_terminated_length": 2123.0, + "completions/mean_length": 1447.25, + "completions/mean_terminated_length": 1447.25, + "completions/min_length": 954.0, + "completions/min_terminated_length": 954.0, + "epoch": 0.09701492537313433, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.683128113607381e-07, + "kl": 0.0, + "learning_rate": 4.594547964113181e-07, + "loss": 0.0, + "num_tokens": 9792013.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 286 + }, + { + "completion_length": 1051.0833435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2202.0, + "completions/max_terminated_length": 2202.0, + "completions/mean_length": 1051.0833740234375, + "completions/mean_terminated_length": 1051.0833740234375, + "completions/min_length": 503.0, + "completions/min_terminated_length": 503.0, + "epoch": 0.09735413839891452, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0335989486520702e-07, + "kl": 0.0, + "learning_rate": 4.5928226363008973e-07, + "loss": 0.0, + "num_tokens": 9817568.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 287 + }, + { + "completion_length": 2563.8333740234375, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5924.0, + "completions/mean_length": 4211.08349609375, + "completions/mean_terminated_length": 3418.444580078125, + "completions/min_length": 1609.0, + "completions/min_terminated_length": 1609.0, + "epoch": 0.09769335142469471, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8279024958610535, + "kl": NaN, + "learning_rate": 4.591097308488613e-07, + "loss": -0.0875, + "num_tokens": 9862596.0, + "reward": 0.8041667342185974, + "reward_std": 0.3116154074668884, + "rewards/correctness_reward_func/mean": 0.5666666626930237, + "rewards/correctness_reward_func/std": 0.42497774958610535, + "rewards/format_reward_func/mean": 0.23750001192092896, + "rewards/format_reward_func/std": 0.11894422769546509, + "step": 288 + }, + { + "completion_length": 3452.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5503.0, + "completions/max_terminated_length": 5503.0, + "completions/mean_length": 3452.83349609375, + "completions/mean_terminated_length": 3452.83349609375, + "completions/min_length": 2014.0, + "completions/min_terminated_length": 2014.0, + "epoch": 0.0980325644504749, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.5893719806763283e-07, + "loss": 0.0, + "num_tokens": 9915352.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 289 + }, + { + "completion_length": 1157.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1837.0, + "completions/max_terminated_length": 1837.0, + "completions/mean_length": 1157.416748046875, + "completions/mean_terminated_length": 1157.416748046875, + "completions/min_length": 640.0, + "completions/min_terminated_length": 640.0, + "epoch": 0.09837177747625508, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.06177419424057007, + "kl": 0.0, + "learning_rate": 4.587646652864044e-07, + "loss": 0.0001, + "num_tokens": 9939849.0, + "reward": 1.0875000953674316, + "reward_std": 0.030618613585829735, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 290 + }, + { + "completion_length": 595.2500305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1038.0, + "completions/max_terminated_length": 1038.0, + "completions/mean_length": 595.25, + "completions/mean_terminated_length": 595.25, + "completions/min_length": 367.0, + "completions/min_terminated_length": 367.0, + "epoch": 0.09871099050203527, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08985080569982529, + "kl": 0.0, + "learning_rate": 4.58592132505176e-07, + "loss": 0.0007, + "num_tokens": 9958920.0, + "reward": 0.7583333849906921, + "reward_std": 0.07955463975667953, + "rewards/correctness_reward_func/mean": 0.4833333492279053, + "rewards/correctness_reward_func/std": 0.5078176856040955, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 291 + }, + { + "completion_length": 2860.666748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5107.0, + "completions/max_terminated_length": 5107.0, + "completions/mean_length": 2860.666748046875, + "completions/mean_terminated_length": 2860.666748046875, + "completions/min_length": 847.0, + "completions/min_terminated_length": 847.0, + "epoch": 0.09905020352781546, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6327357292175293, + "kl": 0.0, + "learning_rate": 4.584195997239475e-07, + "loss": -0.0131, + "num_tokens": 9998972.0, + "reward": 0.7666667699813843, + "reward_std": 0.36985844373703003, + "rewards/correctness_reward_func/mean": 0.46666669845581055, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 292 + }, + { + "completion_length": 2198.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4135.0, + "completions/max_terminated_length": 4135.0, + "completions/mean_length": 2198.83349609375, + "completions/mean_terminated_length": 2198.83349609375, + "completions/min_length": 1079.0, + "completions/min_terminated_length": 1079.0, + "epoch": 0.09938941655359566, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14555297791957855, + "kl": 0.0, + "learning_rate": 4.582470669427191e-07, + "loss": -0.004, + "num_tokens": 10040952.0, + "reward": 1.1666667461395264, + "reward_std": 0.09559705853462219, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 293 + }, + { + "completion_length": 1870.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5196.0, + "completions/max_terminated_length": 5196.0, + "completions/mean_length": 1870.0833740234375, + "completions/mean_terminated_length": 1870.0833740234375, + "completions/min_length": 573.0, + "completions/min_terminated_length": 573.0, + "epoch": 0.09972862957937585, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6394572257995605, + "kl": 0.0, + "learning_rate": 4.5807453416149065e-07, + "loss": 0.0048, + "num_tokens": 10071475.0, + "reward": 0.7833334803581238, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func/mean": 0.4833333194255829, + "rewards/correctness_reward_func/std": 0.43029236793518066, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 294 + }, + { + "completion_length": 2610.9166870117188, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5046.0, + "completions/mean_length": 3709.08349609375, + "completions/mean_terminated_length": 3133.10009765625, + "completions/min_length": 1825.0, + "completions/min_terminated_length": 1825.0, + "epoch": 0.10006784260515604, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9850728511810303, + "kl": NaN, + "learning_rate": 4.5790200138026226e-07, + "loss": -0.0765, + "num_tokens": 10118868.0, + "reward": 0.8333333730697632, + "reward_std": 0.5316232442855835, + "rewards/correctness_reward_func/mean": 0.5833333134651184, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.2500000298023224, + "rewards/format_reward_func/std": 0.11677484214305878, + "step": 295 + }, + { + "completion_length": 3167.666748046875, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6466.0, + "completions/mean_length": 4814.9169921875, + "completions/mean_terminated_length": 4223.5556640625, + "completions/min_length": 1487.0, + "completions/min_terminated_length": 1487.0, + "epoch": 0.10040705563093623, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.3844717741012573, + "kl": NaN, + "learning_rate": 4.5772946859903376e-07, + "loss": -0.0294, + "num_tokens": 10169294.0, + "reward": 0.6375000476837158, + "reward_std": 0.07373940199613571, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.23750001192092896, + "rewards/format_reward_func/std": 0.11894422769546509, + "step": 296 + }, + { + "completion_length": 2613.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4815.0, + "completions/max_terminated_length": 4815.0, + "completions/mean_length": 2613.0, + "completions/mean_terminated_length": 2613.0, + "completions/min_length": 1343.0, + "completions/min_terminated_length": 1343.0, + "epoch": 0.10074626865671642, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.12785717844963074, + "kl": 0.0, + "learning_rate": 4.5755693581780536e-07, + "loss": 0.0003, + "num_tokens": 10213808.0, + "reward": 1.183333396911621, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 297 + }, + { + "completion_length": 1011.9167175292969, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1775.0, + "completions/max_terminated_length": 1775.0, + "completions/mean_length": 1011.9166870117188, + "completions/mean_terminated_length": 1011.9166870117188, + "completions/min_length": 474.0, + "completions/min_terminated_length": 474.0, + "epoch": 0.1010854816824966, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.48920974135398865, + "kl": 0.0, + "learning_rate": 4.573844030365769e-07, + "loss": -0.0006, + "num_tokens": 10236667.0, + "reward": 1.2041666507720947, + "reward_std": 0.2002602517604828, + "rewards/correctness_reward_func/mean": 0.9166666865348816, + "rewards/correctness_reward_func/std": 0.28867512941360474, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 298 + }, + { + "completion_length": 3067.5001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6274.0, + "completions/max_terminated_length": 6274.0, + "completions/mean_length": 3067.5, + "completions/mean_terminated_length": 3067.5, + "completions/min_length": 1115.0, + "completions/min_terminated_length": 1115.0, + "epoch": 0.1014246947082768, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07078174501657486, + "kl": 0.0, + "learning_rate": 4.572118702553485e-07, + "loss": -0.0003, + "num_tokens": 10285699.0, + "reward": 1.183333396911621, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 299 + }, + { + "completion_length": 2652.2501220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4168.0, + "completions/max_terminated_length": 4168.0, + "completions/mean_length": 2652.25, + "completions/mean_terminated_length": 2652.25, + "completions/min_length": 1339.0, + "completions/min_terminated_length": 1339.0, + "epoch": 0.10176390773405698, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.06933386623859406, + "kl": 0.0, + "learning_rate": 4.5703933747412e-07, + "loss": -0.0005, + "num_tokens": 10330456.0, + "reward": 0.6750000715255737, + "reward_std": 0.03872981667518616, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 300 + }, + { + "completion_length": 489.00001525878906, + "completions/clipped_ratio": 0.5, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 1186.0, + "completions/mean_length": 3783.5, + "completions/mean_terminated_length": 978.0, + "completions/min_length": 566.0, + "completions/min_terminated_length": 566.0, + "epoch": 0.10210312075983717, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": NaN, + "learning_rate": 4.568668046928916e-07, + "loss": 0.0, + "num_tokens": 10348420.0, + "reward": 0.15000000596046448, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.15000000596046448, + "rewards/format_reward_func/std": 0.15666989982128143, + "step": 301 + }, + { + "completion_length": 3680.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5165.0, + "completions/max_terminated_length": 5165.0, + "completions/mean_length": 3680.33349609375, + "completions/mean_terminated_length": 3680.33349609375, + "completions/min_length": 1925.0, + "completions/min_terminated_length": 1925.0, + "epoch": 0.10244233378561737, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.457709223970596e-07, + "kl": 0.0, + "learning_rate": 4.5669427191166323e-07, + "loss": 0.0, + "num_tokens": 10402940.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 302 + }, + { + "completion_length": 1168.0000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2076.0, + "completions/max_terminated_length": 2076.0, + "completions/mean_length": 1168.0, + "completions/mean_terminated_length": 1168.0, + "completions/min_length": 620.0, + "completions/min_terminated_length": 620.0, + "epoch": 0.10278154681139756, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5187942981719971, + "kl": 0.0, + "learning_rate": 4.5652173913043473e-07, + "loss": -0.0028, + "num_tokens": 10434620.0, + "reward": 1.0375001430511475, + "reward_std": 0.2883797585964203, + "rewards/correctness_reward_func/mean": 0.75, + "rewards/correctness_reward_func/std": 0.36306774616241455, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 303 + }, + { + "completion_length": 1859.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3710.0, + "completions/max_terminated_length": 3710.0, + "completions/mean_length": 1859.916748046875, + "completions/mean_terminated_length": 1859.916748046875, + "completions/min_length": 541.0, + "completions/min_terminated_length": 541.0, + "epoch": 0.10312075983717775, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.17270970344543457, + "kl": 0.0, + "learning_rate": 4.5634920634920634e-07, + "loss": -0.0001, + "num_tokens": 10468945.0, + "reward": 1.1666667461395264, + "reward_std": 0.051639750599861145, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 304 + }, + { + "completion_length": 2067.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3797.0, + "completions/max_terminated_length": 3797.0, + "completions/mean_length": 2067.166748046875, + "completions/mean_terminated_length": 2067.166748046875, + "completions/min_length": 720.0, + "completions/min_terminated_length": 720.0, + "epoch": 0.10345997286295794, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.4897084927033575e-07, + "kl": 0.0, + "learning_rate": 4.561766735679779e-07, + "loss": 0.0, + "num_tokens": 10505637.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 305 + }, + { + "completion_length": 769.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1071.0, + "completions/max_terminated_length": 1071.0, + "completions/mean_length": 769.25, + "completions/mean_terminated_length": 769.25, + "completions/min_length": 633.0, + "completions/min_terminated_length": 633.0, + "epoch": 0.10379918588873813, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.078951990204587e-08, + "kl": 0.0, + "learning_rate": 4.560041407867495e-07, + "loss": 0.0, + "num_tokens": 10528476.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 306 + }, + { + "completion_length": 2807.5833740234375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4502.0, + "completions/mean_length": 3356.666748046875, + "completions/mean_terminated_length": 3062.818359375, + "completions/min_length": 1536.0, + "completions/min_terminated_length": 1536.0, + "epoch": 0.10413839891451832, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09662654250860214, + "kl": NaN, + "learning_rate": 4.55831608005521e-07, + "loss": -0.0059, + "num_tokens": 10573567.0, + "reward": 0.6875001788139343, + "reward_std": 0.03061862289905548, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 307 + }, + { + "completion_length": 971.2500305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1409.0, + "completions/max_terminated_length": 1409.0, + "completions/mean_length": 971.25, + "completions/mean_terminated_length": 971.25, + "completions/min_length": 642.0, + "completions/min_terminated_length": 642.0, + "epoch": 0.1044776119402985, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07163669914007187, + "kl": 0.0, + "learning_rate": 4.556590752242926e-07, + "loss": 0.0004, + "num_tokens": 10592332.0, + "reward": 1.1500000953674316, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.8500000834465027, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 308 + }, + { + "completion_length": 3025.5833740234375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5822.0, + "completions/mean_length": 3574.666748046875, + "completions/mean_terminated_length": 3300.636474609375, + "completions/min_length": 919.0, + "completions/min_terminated_length": 919.0, + "epoch": 0.10481682496607869, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5091751217842102, + "kl": NaN, + "learning_rate": 4.5548654244306415e-07, + "loss": -0.0043, + "num_tokens": 10640033.0, + "reward": 0.770833432674408, + "reward_std": 0.2123773992061615, + "rewards/correctness_reward_func/mean": 0.4833333194255829, + "rewards/correctness_reward_func/std": 0.43029236793518066, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 309 + }, + { + "completion_length": 1609.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2961.0, + "completions/max_terminated_length": 2961.0, + "completions/mean_length": 1609.916748046875, + "completions/mean_terminated_length": 1609.916748046875, + "completions/min_length": 465.0, + "completions/min_terminated_length": 465.0, + "epoch": 0.10515603799185888, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07517128437757492, + "kl": 0.0, + "learning_rate": 4.5531400966183576e-07, + "loss": -0.0007, + "num_tokens": 10670926.0, + "reward": 1.183333396911621, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 310 + }, + { + "completion_length": 3589.7501220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4867.0, + "completions/max_terminated_length": 4867.0, + "completions/mean_length": 3589.75, + "completions/mean_terminated_length": 3589.75, + "completions/min_length": 2861.0, + "completions/min_terminated_length": 2861.0, + "epoch": 0.10549525101763908, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1443360447883606, + "kl": 0.0, + "learning_rate": 4.5514147688060726e-07, + "loss": -0.0008, + "num_tokens": 10727071.0, + "reward": 0.7041667699813843, + "reward_std": 0.07144345343112946, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 311 + }, + { + "completion_length": 1537.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4027.0, + "completions/max_terminated_length": 4027.0, + "completions/mean_length": 1537.3333740234375, + "completions/mean_terminated_length": 1537.3333740234375, + "completions/min_length": 811.0, + "completions/min_terminated_length": 811.0, + "epoch": 0.10583446404341927, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.42037126421928406, + "kl": 0.0, + "learning_rate": 4.5496894409937887e-07, + "loss": 0.0139, + "num_tokens": 10761491.0, + "reward": 1.1666667461395264, + "reward_std": 0.2588964104652405, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.287096232175827, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 312 + }, + { + "completion_length": 2531.7501220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4342.0, + "completions/max_terminated_length": 4342.0, + "completions/mean_length": 2531.75, + "completions/mean_terminated_length": 2531.75, + "completions/min_length": 874.0, + "completions/min_terminated_length": 874.0, + "epoch": 0.10617367706919946, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.324536924585118e-07, + "kl": 0.0, + "learning_rate": 4.547964113181504e-07, + "loss": 0.0, + "num_tokens": 10803350.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 313 + }, + { + "completion_length": 3405.416748046875, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6562.0, + "completions/mean_length": 3954.5, + "completions/mean_terminated_length": 3715.0, + "completions/min_length": 1611.0, + "completions/min_terminated_length": 1611.0, + "epoch": 0.10651289009497965, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.647424578666687, + "kl": NaN, + "learning_rate": 4.5462387853692197e-07, + "loss": -0.0113, + "num_tokens": 10858477.0, + "reward": 0.6791666746139526, + "reward_std": 0.2734726071357727, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.5149286985397339, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 314 + }, + { + "completion_length": 2428.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5417.0, + "completions/max_terminated_length": 5417.0, + "completions/mean_length": 2428.166748046875, + "completions/mean_terminated_length": 2428.166748046875, + "completions/min_length": 1073.0, + "completions/min_terminated_length": 1073.0, + "epoch": 0.10685210312075984, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5885283350944519, + "kl": 0.0, + "learning_rate": 4.544513457556935e-07, + "loss": 0.023, + "num_tokens": 10903773.0, + "reward": 1.1166667938232422, + "reward_std": 0.24571877717971802, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.27579089999198914, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 315 + }, + { + "completion_length": 3376.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6365.0, + "completions/max_terminated_length": 6365.0, + "completions/mean_length": 3376.166748046875, + "completions/mean_terminated_length": 3376.166748046875, + "completions/min_length": 1547.0, + "completions/min_terminated_length": 1547.0, + "epoch": 0.10719131614654002, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.15940889716148376, + "kl": 0.0, + "learning_rate": 4.5427881297446513e-07, + "loss": -0.0027, + "num_tokens": 10957691.0, + "reward": 1.183333396911621, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 316 + }, + { + "completion_length": 1784.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4751.0, + "completions/max_terminated_length": 4751.0, + "completions/mean_length": 1784.8333740234375, + "completions/mean_terminated_length": 1784.8333740234375, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.10753052917232021, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.41797852516174316, + "kl": 0.0, + "learning_rate": 4.5410628019323674e-07, + "loss": 0.0045, + "num_tokens": 10988949.0, + "reward": 1.0333335399627686, + "reward_std": 0.1632993221282959, + "rewards/correctness_reward_func/mean": 0.7333333492279053, + "rewards/correctness_reward_func/std": 0.23094011843204498, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 317 + }, + { + "completion_length": 1927.2501220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3727.0, + "completions/max_terminated_length": 3727.0, + "completions/mean_length": 1927.25, + "completions/mean_terminated_length": 1927.25, + "completions/min_length": 823.0, + "completions/min_terminated_length": 823.0, + "epoch": 0.1078697421981004, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07680307328701019, + "kl": 0.0, + "learning_rate": 4.5393374741200824e-07, + "loss": -0.0008, + "num_tokens": 11024550.0, + "reward": 1.1375000476837158, + "reward_std": 0.030618613585829735, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.23750001192092896, + "rewards/format_reward_func/std": 0.07723929733037949, + "step": 318 + }, + { + "completion_length": 954.4166717529297, + "completions/clipped_ratio": 0.41666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5713.0, + "completions/mean_length": 3699.83349609375, + "completions/mean_terminated_length": 1636.1429443359375, + "completions/min_length": 681.0, + "completions/min_terminated_length": 681.0, + "epoch": 0.10820895522388059, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2716827988624573, + "kl": NaN, + "learning_rate": 4.5376121463077984e-07, + "loss": -0.0151, + "num_tokens": 11048717.0, + "reward": 0.6833333373069763, + "reward_std": 0.10206204652786255, + "rewards/correctness_reward_func/mean": 0.4833333492279053, + "rewards/correctness_reward_func/std": 0.5078176856040955, + "rewards/format_reward_func/mean": 0.20000000298023224, + "rewards/format_reward_func/std": 0.133143812417984, + "step": 319 + }, + { + "completion_length": 2118.5834350585938, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6177.0, + "completions/max_terminated_length": 6177.0, + "completions/mean_length": 2118.58349609375, + "completions/mean_terminated_length": 2118.58349609375, + "completions/min_length": 778.0, + "completions/min_terminated_length": 778.0, + "epoch": 0.10854816824966079, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16588005423545837, + "kl": 0.0, + "learning_rate": 4.535886818495514e-07, + "loss": 0.0046, + "num_tokens": 11085858.0, + "reward": 1.1666667461395264, + "reward_std": 0.09559707343578339, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 320 + }, + { + "completion_length": 3266.5001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4822.0, + "completions/max_terminated_length": 4822.0, + "completions/mean_length": 3266.5, + "completions/mean_terminated_length": 3266.5, + "completions/min_length": 2462.0, + "completions/min_terminated_length": 2462.0, + "epoch": 0.10888738127544098, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.137014701962471, + "kl": 0.0, + "learning_rate": 4.53416149068323e-07, + "loss": 0.0032, + "num_tokens": 11136462.0, + "reward": 0.75, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.44999995827674866, + "rewards/correctness_reward_func/std": 0.47577688097953796, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 321 + }, + { + "completion_length": 2219.0000610351562, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5317.0, + "completions/mean_length": 2768.08349609375, + "completions/mean_terminated_length": 2420.727294921875, + "completions/min_length": 506.0, + "completions/min_terminated_length": 506.0, + "epoch": 0.10922659430122117, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0347485542297363, + "kl": NaN, + "learning_rate": 4.532436162870945e-07, + "loss": -0.049, + "num_tokens": 11172702.0, + "reward": 0.9375001192092896, + "reward_std": 0.28885549306869507, + "rewards/correctness_reward_func/mean": 0.6499999761581421, + "rewards/correctness_reward_func/std": 0.40113475918769836, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 322 + }, + { + "completion_length": 1779.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3227.0, + "completions/max_terminated_length": 3227.0, + "completions/mean_length": 1779.75, + "completions/mean_terminated_length": 1779.75, + "completions/min_length": 648.0, + "completions/min_terminated_length": 648.0, + "epoch": 0.10956580732700136, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.34098246693611145, + "kl": 0.0, + "learning_rate": 4.530710835058661e-07, + "loss": -0.0022, + "num_tokens": 11204241.0, + "reward": 0.36666667461395264, + "reward_std": 0.1632993221282959, + "rewards/correctness_reward_func/mean": 0.06666667014360428, + "rewards/correctness_reward_func/std": 0.23094011843204498, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 323 + }, + { + "completion_length": 2422.7501220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4466.0, + "completions/max_terminated_length": 4466.0, + "completions/mean_length": 2422.75, + "completions/mean_terminated_length": 2422.75, + "completions/min_length": 853.0, + "completions/min_terminated_length": 853.0, + "epoch": 0.10990502035278155, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.12155548483133316, + "kl": 0.0, + "learning_rate": 4.5289855072463766e-07, + "loss": 0.0057, + "num_tokens": 11242524.0, + "reward": 0.75, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.45000001788139343, + "rewards/correctness_reward_func/std": 0.47577688097953796, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 324 + }, + { + "completion_length": 1740.2500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2459.0, + "completions/max_terminated_length": 2459.0, + "completions/mean_length": 1740.25, + "completions/mean_terminated_length": 1740.25, + "completions/min_length": 1336.0, + "completions/min_terminated_length": 1336.0, + "epoch": 0.11024423337856173, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1897661522652925e-07, + "kl": 0.0, + "learning_rate": 4.527260179434092e-07, + "loss": 0.0, + "num_tokens": 11277657.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 325 + }, + { + "completion_length": 1775.8333435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5753.0, + "completions/max_terminated_length": 5753.0, + "completions/mean_length": 1775.8333740234375, + "completions/mean_terminated_length": 1775.8333740234375, + "completions/min_length": 460.0, + "completions/min_terminated_length": 460.0, + "epoch": 0.11058344640434192, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16949614882469177, + "kl": 0.0, + "learning_rate": 4.5255348516218076e-07, + "loss": 0.0069, + "num_tokens": 11312557.0, + "reward": 1.1875, + "reward_std": 0.09653984010219574, + "rewards/correctness_reward_func/mean": 0.9000000357627869, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 326 + }, + { + "completion_length": 930.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2166.0, + "completions/max_terminated_length": 2166.0, + "completions/mean_length": 930.0833740234375, + "completions/mean_terminated_length": 930.0833740234375, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "epoch": 0.11092265943012211, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2900066673755646, + "kl": 0.0, + "learning_rate": 4.5238095238095237e-07, + "loss": -0.0033, + "num_tokens": 11334668.0, + "reward": 1.0375001430511475, + "reward_std": 0.23474279046058655, + "rewards/correctness_reward_func/mean": 0.75, + "rewards/correctness_reward_func/std": 0.24308621883392334, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 327 + }, + { + "completion_length": 2132.3333740234375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4965.0, + "completions/mean_length": 2681.416748046875, + "completions/mean_terminated_length": 2326.181884765625, + "completions/min_length": 779.0, + "completions/min_terminated_length": 779.0, + "epoch": 0.1112618724559023, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5346319079399109, + "kl": NaN, + "learning_rate": 4.522084195997239e-07, + "loss": -0.0373, + "num_tokens": 11372550.0, + "reward": 1.008333444595337, + "reward_std": 0.22453658282756805, + "rewards/correctness_reward_func/mean": 0.7333333492279053, + "rewards/correctness_reward_func/std": 0.23094011843204498, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 328 + }, + { + "completion_length": 1133.2500305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3432.0, + "completions/max_terminated_length": 3432.0, + "completions/mean_length": 1133.25, + "completions/mean_terminated_length": 1133.25, + "completions/min_length": 560.0, + "completions/min_terminated_length": 560.0, + "epoch": 0.1116010854816825, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.394129978711135e-07, + "kl": 0.0, + "learning_rate": 4.520358868184955e-07, + "loss": 0.0, + "num_tokens": 11402877.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 329 + }, + { + "completion_length": 3503.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4891.0, + "completions/max_terminated_length": 4891.0, + "completions/mean_length": 3503.916748046875, + "completions/mean_terminated_length": 3503.916748046875, + "completions/min_length": 2531.0, + "completions/min_terminated_length": 2531.0, + "epoch": 0.11194029850746269, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5139081478118896, + "kl": 0.0, + "learning_rate": 4.5186335403726703e-07, + "loss": 0.0014, + "num_tokens": 11455586.0, + "reward": 1.0333335399627686, + "reward_std": 0.1632993221282959, + "rewards/correctness_reward_func/mean": 0.7333333492279053, + "rewards/correctness_reward_func/std": 0.23094011843204498, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 330 + }, + { + "completion_length": 752.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1077.0, + "completions/max_terminated_length": 1077.0, + "completions/mean_length": 752.1666870117188, + "completions/mean_terminated_length": 752.1666870117188, + "completions/min_length": 545.0, + "completions/min_terminated_length": 545.0, + "epoch": 0.11227951153324288, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2810387909412384, + "kl": 0.0, + "learning_rate": 4.5169082125603863e-07, + "loss": -0.0016, + "num_tokens": 11472094.0, + "reward": 1.0208334922790527, + "reward_std": 0.19391795992851257, + "rewards/correctness_reward_func/mean": 0.7333333492279053, + "rewards/correctness_reward_func/std": 0.23094011843204498, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 331 + }, + { + "completion_length": 3109.666748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6303.0, + "completions/max_terminated_length": 6303.0, + "completions/mean_length": 3109.666748046875, + "completions/mean_terminated_length": 3109.666748046875, + "completions/min_length": 1302.0, + "completions/min_terminated_length": 1302.0, + "epoch": 0.11261872455902307, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.49435505270957947, + "kl": 0.0, + "learning_rate": 4.5151828847481024e-07, + "loss": -0.0082, + "num_tokens": 11521260.0, + "reward": 1.0333335399627686, + "reward_std": 0.1632993221282959, + "rewards/correctness_reward_func/mean": 0.7333333492279053, + "rewards/correctness_reward_func/std": 0.23094011843204498, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 332 + }, + { + "completion_length": 2569.25, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5876.0, + "completions/mean_length": 3118.33349609375, + "completions/mean_terminated_length": 2802.818359375, + "completions/min_length": 704.0, + "completions/min_terminated_length": 704.0, + "epoch": 0.11295793758480326, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2554776668548584, + "kl": NaN, + "learning_rate": 4.5134575569358174e-07, + "loss": -0.0563, + "num_tokens": 11561421.0, + "reward": 0.5125000476837158, + "reward_std": 0.3184925317764282, + "rewards/correctness_reward_func/mean": 0.25, + "rewards/correctness_reward_func/std": 0.45226702094078064, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.09323723614215851, + "step": 333 + }, + { + "completion_length": 1469.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2300.0, + "completions/max_terminated_length": 2300.0, + "completions/mean_length": 1469.916748046875, + "completions/mean_terminated_length": 1469.916748046875, + "completions/min_length": 763.0, + "completions/min_terminated_length": 763.0, + "epoch": 0.11329715061058344, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.08878318965435028, + "kl": 0.0, + "learning_rate": 4.5117322291235335e-07, + "loss": 0.0007, + "num_tokens": 11590916.0, + "reward": 0.7833333015441895, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.4833333492279053, + "rewards/correctness_reward_func/std": 0.5078176856040955, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 334 + }, + { + "completion_length": 2060.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3121.0, + "completions/max_terminated_length": 3121.0, + "completions/mean_length": 2060.33349609375, + "completions/mean_terminated_length": 2060.33349609375, + "completions/min_length": 733.0, + "completions/min_terminated_length": 733.0, + "epoch": 0.11363636363636363, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.510006901311249e-07, + "loss": 0.0, + "num_tokens": 11627238.0, + "reward": 0.30000001192092896, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 335 + }, + { + "completion_length": 2561.0001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3956.0, + "completions/max_terminated_length": 3956.0, + "completions/mean_length": 2561.0, + "completions/mean_terminated_length": 2561.0, + "completions/min_length": 1604.0, + "completions/min_terminated_length": 1604.0, + "epoch": 0.11397557666214382, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5772103071212769, + "kl": 0.0, + "learning_rate": 4.508281573498965e-07, + "loss": 0.0067, + "num_tokens": 11666136.0, + "reward": 1.1500000953674316, + "reward_std": 0.2557638883590698, + "rewards/correctness_reward_func/mean": 0.8499999642372131, + "rewards/correctness_reward_func/std": 0.28444522619247437, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 336 + }, + { + "completion_length": 1338.5833740234375, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6187.0, + "completions/mean_length": 2985.83349609375, + "completions/mean_terminated_length": 1784.77783203125, + "completions/min_length": 672.0, + "completions/min_terminated_length": 672.0, + "epoch": 0.11431478968792401, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.14361168444156647, + "kl": NaN, + "learning_rate": 4.50655624568668e-07, + "loss": -0.0182, + "num_tokens": 11693431.0, + "reward": 0.6250001192092896, + "reward_std": 0.08215838670730591, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.22500000894069672, + "rewards/format_reward_func/std": 0.13568010926246643, + "step": 337 + }, + { + "completion_length": 1112.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3285.0, + "completions/max_terminated_length": 3285.0, + "completions/mean_length": 1112.666748046875, + "completions/mean_terminated_length": 1112.666748046875, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.11465400271370421, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.3458263874053955, + "kl": 0.0, + "learning_rate": 4.504830917874396e-07, + "loss": -0.0033, + "num_tokens": 11719575.0, + "reward": 1.0375001430511475, + "reward_std": 0.17446348071098328, + "rewards/correctness_reward_func/mean": 0.75, + "rewards/correctness_reward_func/std": 0.24308621883392334, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 338 + }, + { + "completion_length": 2146.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5674.0, + "completions/max_terminated_length": 5674.0, + "completions/mean_length": 2146.58349609375, + "completions/mean_terminated_length": 2146.58349609375, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "epoch": 0.1149932157394844, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5401544570922852, + "kl": 0.0, + "learning_rate": 4.5031055900621116e-07, + "loss": 0.0189, + "num_tokens": 11756044.0, + "reward": 0.6708333492279053, + "reward_std": 0.22469764947891235, + "rewards/correctness_reward_func/mean": 0.38333332538604736, + "rewards/correctness_reward_func/std": 0.4783177673816681, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 339 + }, + { + "completion_length": 2249.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4573.0, + "completions/max_terminated_length": 4573.0, + "completions/mean_length": 2249.166748046875, + "completions/mean_terminated_length": 2249.166748046875, + "completions/min_length": 1230.0, + "completions/min_terminated_length": 1230.0, + "epoch": 0.11533242876526459, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.08829537034034729, + "kl": 0.0, + "learning_rate": 4.501380262249827e-07, + "loss": -0.0025, + "num_tokens": 11795994.0, + "reward": 0.7833333015441895, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.4833333492279053, + "rewards/correctness_reward_func/std": 0.5078176856040955, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 340 + }, + { + "completion_length": 2166.0000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4724.0, + "completions/max_terminated_length": 4724.0, + "completions/mean_length": 2166.0, + "completions/mean_terminated_length": 2166.0, + "completions/min_length": 693.0, + "completions/min_terminated_length": 693.0, + "epoch": 0.11567164179104478, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09921236336231232, + "kl": 0.0, + "learning_rate": 4.4996549344375427e-07, + "loss": 0.0037, + "num_tokens": 11834874.0, + "reward": 1.133333444595337, + "reward_std": 0.051639772951602936, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 341 + }, + { + "completion_length": 1815.8333740234375, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5702.0, + "completions/mean_length": 3463.08349609375, + "completions/mean_terminated_length": 2421.111083984375, + "completions/min_length": 956.0, + "completions/min_terminated_length": 956.0, + "epoch": 0.11601085481682497, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7487086653709412, + "kl": NaN, + "learning_rate": 4.497929606625259e-07, + "loss": -0.0992, + "num_tokens": 11865634.0, + "reward": 0.9750000238418579, + "reward_std": 0.35601967573165894, + "rewards/correctness_reward_func/mean": 0.75, + "rewards/correctness_reward_func/std": 0.45226702094078064, + "rewards/format_reward_func/mean": 0.22500000894069672, + "rewards/format_reward_func/std": 0.13568010926246643, + "step": 342 + }, + { + "completion_length": 1490.2500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4809.0, + "completions/max_terminated_length": 4809.0, + "completions/mean_length": 1490.25, + "completions/mean_terminated_length": 1490.25, + "completions/min_length": 912.0, + "completions/min_terminated_length": 912.0, + "epoch": 0.11635006784260515, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07796503603458405, + "kl": 0.0, + "learning_rate": 4.4962042788129743e-07, + "loss": 0.0001, + "num_tokens": 11891383.0, + "reward": 1.2833333015441895, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.9833333492279053, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 343 + }, + { + "completion_length": 1805.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3139.0, + "completions/max_terminated_length": 3139.0, + "completions/mean_length": 1805.3333740234375, + "completions/mean_terminated_length": 1805.3333740234375, + "completions/min_length": 947.0, + "completions/min_terminated_length": 947.0, + "epoch": 0.11668928086838534, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.4056276970441104e-07, + "kl": 0.0, + "learning_rate": 4.49447895100069e-07, + "loss": 0.0, + "num_tokens": 11928323.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 344 + }, + { + "completion_length": 2549.83349609375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4606.0, + "completions/max_terminated_length": 4606.0, + "completions/mean_length": 2549.83349609375, + "completions/mean_terminated_length": 2549.83349609375, + "completions/min_length": 779.0, + "completions/min_terminated_length": 779.0, + "epoch": 0.11702849389416553, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6675385236740112, + "kl": 0.0, + "learning_rate": 4.4927536231884053e-07, + "loss": 0.0182, + "num_tokens": 11969691.0, + "reward": 1.0500000715255737, + "reward_std": 0.20412415266036987, + "rewards/correctness_reward_func/mean": 0.75, + "rewards/correctness_reward_func/std": 0.24308621883392334, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 345 + }, + { + "completion_length": 3366.666748046875, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6368.0, + "completions/mean_length": 5013.9169921875, + "completions/mean_terminated_length": 4488.88916015625, + "completions/min_length": 1588.0, + "completions/min_terminated_length": 1588.0, + "epoch": 0.11736770691994572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.836334228515625, + "kl": NaN, + "learning_rate": 4.4910282953761214e-07, + "loss": -0.0777, + "num_tokens": 12023327.0, + "reward": 0.6041667461395264, + "reward_std": 0.29333966970443726, + "rewards/correctness_reward_func/mean": 0.36666667461395264, + "rewards/correctness_reward_func/std": 0.45792683959007263, + "rewards/format_reward_func/mean": 0.23750001192092896, + "rewards/format_reward_func/std": 0.11894422769546509, + "step": 346 + }, + { + "completion_length": 1315.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2489.0, + "completions/max_terminated_length": 2489.0, + "completions/mean_length": 1315.916748046875, + "completions/mean_terminated_length": 1315.916748046875, + "completions/min_length": 523.0, + "completions/min_terminated_length": 523.0, + "epoch": 0.11770691994572592, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.1746159006852395e-07, + "kl": 0.0, + "learning_rate": 4.4893029675638374e-07, + "loss": 0.0, + "num_tokens": 12052486.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 347 + }, + { + "completion_length": 3217.666748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6038.0, + "completions/max_terminated_length": 6038.0, + "completions/mean_length": 3217.666748046875, + "completions/mean_terminated_length": 3217.666748046875, + "completions/min_length": 1073.0, + "completions/min_terminated_length": 1073.0, + "epoch": 0.11804613297150611, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5589861273765564, + "kl": 0.0, + "learning_rate": 4.4875776397515524e-07, + "loss": -0.004, + "num_tokens": 12104628.0, + "reward": 1.0875000953674316, + "reward_std": 0.2607758939266205, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.2696799635887146, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 348 + }, + { + "completion_length": 1836.5834350585938, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5143.0, + "completions/max_terminated_length": 5143.0, + "completions/mean_length": 1836.5833740234375, + "completions/mean_terminated_length": 1836.5833740234375, + "completions/min_length": 686.0, + "completions/min_terminated_length": 686.0, + "epoch": 0.1183853459972863, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.10859017074108124, + "kl": 0.0, + "learning_rate": 4.4858523119392685e-07, + "loss": 0.0043, + "num_tokens": 12135391.0, + "reward": 1.0875000953674316, + "reward_std": 0.030618613585829735, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 349 + }, + { + "completion_length": 2148.0834350585938, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3424.0, + "completions/max_terminated_length": 3424.0, + "completions/mean_length": 2148.08349609375, + "completions/mean_terminated_length": 2148.08349609375, + "completions/min_length": 928.0, + "completions/min_terminated_length": 928.0, + "epoch": 0.11872455902306649, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.08867096155881882, + "kl": 0.0, + "learning_rate": 4.484126984126984e-07, + "loss": 0.0015, + "num_tokens": 12173024.0, + "reward": 1.2833333015441895, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.9833333492279053, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 350 + }, + { + "completion_length": 1849.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3647.0, + "completions/max_terminated_length": 3647.0, + "completions/mean_length": 1849.666748046875, + "completions/mean_terminated_length": 1849.666748046875, + "completions/min_length": 829.0, + "completions/min_terminated_length": 829.0, + "epoch": 0.11906377204884667, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.13948985934257507, + "kl": 0.0, + "learning_rate": 4.4824016563146996e-07, + "loss": 0.0055, + "num_tokens": 12207754.0, + "reward": 1.1500000953674316, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.8500000834465027, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 351 + }, + { + "completion_length": 1228.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2132.0, + "completions/max_terminated_length": 2132.0, + "completions/mean_length": 1228.916748046875, + "completions/mean_terminated_length": 1228.916748046875, + "completions/min_length": 790.0, + "completions/min_terminated_length": 790.0, + "epoch": 0.11940298507462686, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.3335708677768707, + "kl": 0.0, + "learning_rate": 4.480676328502415e-07, + "loss": -0.0039, + "num_tokens": 12235113.0, + "reward": 1.1375000476837158, + "reward_std": 0.20600365102291107, + "rewards/correctness_reward_func/mean": 0.8499999642372131, + "rewards/correctness_reward_func/std": 0.28444522619247437, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 352 + }, + { + "completion_length": 1891.416748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4444.0, + "completions/max_terminated_length": 4444.0, + "completions/mean_length": 1891.416748046875, + "completions/mean_terminated_length": 1891.416748046875, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "epoch": 0.11974219810040705, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.3772743046283722, + "kl": 0.0, + "learning_rate": 4.478951000690131e-07, + "loss": -0.0017, + "num_tokens": 12268598.0, + "reward": 1.133333444595337, + "reward_std": 0.1632993221282959, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.2806917726993561, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 353 + }, + { + "completion_length": 1879.8334350585938, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5891.0, + "completions/max_terminated_length": 5891.0, + "completions/mean_length": 1879.8333740234375, + "completions/mean_terminated_length": 1879.8333740234375, + "completions/min_length": 484.0, + "completions/min_terminated_length": 484.0, + "epoch": 0.12008141112618724, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9070640802383423, + "kl": 0.0, + "learning_rate": 4.4772256728778467e-07, + "loss": -0.0369, + "num_tokens": 12300792.0, + "reward": 0.3583333492279053, + "reward_std": 0.24285396933555603, + "rewards/correctness_reward_func/mean": 0.0833333358168602, + "rewards/correctness_reward_func/std": 0.28867512941360474, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 354 + }, + { + "completion_length": 1915.0000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4033.0, + "completions/max_terminated_length": 4033.0, + "completions/mean_length": 1915.0, + "completions/mean_terminated_length": 1915.0, + "completions/min_length": 814.0, + "completions/min_terminated_length": 814.0, + "epoch": 0.12042062415196743, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09051363170146942, + "kl": 0.0, + "learning_rate": 4.475500345065562e-07, + "loss": -0.0006, + "num_tokens": 12334404.0, + "reward": 1.1500000953674316, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.8500000834465027, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 355 + }, + { + "completion_length": 1688.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3541.0, + "completions/max_terminated_length": 3541.0, + "completions/mean_length": 1688.0833740234375, + "completions/mean_terminated_length": 1688.0833740234375, + "completions/min_length": 497.0, + "completions/min_terminated_length": 497.0, + "epoch": 0.12075983717774763, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0811961218714714, + "kl": 0.0, + "learning_rate": 4.4737750172532777e-07, + "loss": 0.0016, + "num_tokens": 12362599.0, + "reward": 1.133333444595337, + "reward_std": 0.051639772951602936, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 356 + }, + { + "completion_length": 1835.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3861.0, + "completions/max_terminated_length": 3861.0, + "completions/mean_length": 1835.166748046875, + "completions/mean_terminated_length": 1835.166748046875, + "completions/min_length": 697.0, + "completions/min_terminated_length": 697.0, + "epoch": 0.12109905020352782, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.472049689440994e-07, + "loss": 0.0, + "num_tokens": 12398781.0, + "reward": 1.2999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 1.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 357 + }, + { + "completion_length": 983.7500305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1866.0, + "completions/max_terminated_length": 1866.0, + "completions/mean_length": 983.75, + "completions/mean_terminated_length": 983.75, + "completions/min_length": 584.0, + "completions/min_terminated_length": 584.0, + "epoch": 0.12143826322930801, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.9967588116287516e-07, + "kl": 0.0, + "learning_rate": 4.4703243616287093e-07, + "loss": 0.0, + "num_tokens": 12422490.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 358 + }, + { + "completion_length": 3115.75, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6423.0, + "completions/mean_length": 4213.9169921875, + "completions/mean_terminated_length": 3738.900146484375, + "completions/min_length": 1909.0, + "completions/min_terminated_length": 1909.0, + "epoch": 0.1217774762550882, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9891613721847534, + "kl": NaN, + "learning_rate": 4.468599033816425e-07, + "loss": -0.0521, + "num_tokens": 12475245.0, + "reward": 0.783333420753479, + "reward_std": 0.2532995343208313, + "rewards/correctness_reward_func/mean": 0.5333333015441895, + "rewards/correctness_reward_func/std": 0.47736650705337524, + "rewards/format_reward_func/mean": 0.2500000298023224, + "rewards/format_reward_func/std": 0.11677484214305878, + "step": 359 + }, + { + "completion_length": 2723.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4374.0, + "completions/max_terminated_length": 4374.0, + "completions/mean_length": 2723.166748046875, + "completions/mean_terminated_length": 2723.166748046875, + "completions/min_length": 1459.0, + "completions/min_terminated_length": 1459.0, + "epoch": 0.12211668928086838, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.2058346499088657e-07, + "kl": 0.0, + "learning_rate": 4.4668737060041404e-07, + "loss": 0.0, + "num_tokens": 12520649.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 360 + }, + { + "completion_length": 1992.416748046875, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5684.0, + "completions/mean_length": 2541.5, + "completions/mean_terminated_length": 2173.54541015625, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "epoch": 0.12245590230664857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2540488541126251, + "kl": NaN, + "learning_rate": 4.4651483781918564e-07, + "loss": -0.0095, + "num_tokens": 12556066.0, + "reward": 0.6791667342185974, + "reward_std": 0.10357433557510376, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.09323723614215851, + "step": 361 + }, + { + "completion_length": 2420.5834350585938, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5967.0, + "completions/mean_length": 2969.666748046875, + "completions/mean_terminated_length": 2640.636474609375, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "epoch": 0.12279511533242876, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33543863892555237, + "kl": NaN, + "learning_rate": 4.4634230503795714e-07, + "loss": -0.0596, + "num_tokens": 12599795.0, + "reward": 1.0958333015441895, + "reward_std": 0.2968290448188782, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.28069180250167847, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.09323723614215851, + "step": 362 + }, + { + "completion_length": 2000.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4281.0, + "completions/max_terminated_length": 4281.0, + "completions/mean_length": 2000.166748046875, + "completions/mean_terminated_length": 2000.166748046875, + "completions/min_length": 627.0, + "completions/min_terminated_length": 627.0, + "epoch": 0.12313432835820895, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.47814515233039856, + "kl": 0.0, + "learning_rate": 4.4616977225672875e-07, + "loss": 0.0103, + "num_tokens": 12634171.0, + "reward": 1.066666603088379, + "reward_std": 0.20655910670757294, + "rewards/correctness_reward_func/mean": 0.7666666507720947, + "rewards/correctness_reward_func/std": 0.3700941503047943, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 363 + }, + { + "completion_length": 2920.33349609375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6174.0, + "completions/max_terminated_length": 6174.0, + "completions/mean_length": 2920.33349609375, + "completions/mean_terminated_length": 2920.33349609375, + "completions/min_length": 636.0, + "completions/min_terminated_length": 636.0, + "epoch": 0.12347354138398914, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5874470472335815, + "kl": 0.0, + "learning_rate": 4.4599723947550035e-07, + "loss": 0.0089, + "num_tokens": 12684635.0, + "reward": 1.133333444595337, + "reward_std": 0.25163978338241577, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.28069180250167847, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 364 + }, + { + "completion_length": 2071.666748046875, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5161.0, + "completions/mean_length": 3169.83349609375, + "completions/mean_terminated_length": 2486.0, + "completions/min_length": 726.0, + "completions/min_terminated_length": 726.0, + "epoch": 0.12381275440976934, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7931068539619446, + "kl": NaN, + "learning_rate": 4.458247066942719e-07, + "loss": -0.0785, + "num_tokens": 12722335.0, + "reward": 0.5166666507720947, + "reward_std": 0.2840188145637512, + "rewards/correctness_reward_func/mean": 0.2666666805744171, + "rewards/correctness_reward_func/std": 0.393892765045166, + "rewards/format_reward_func/mean": 0.2500000298023224, + "rewards/format_reward_func/std": 0.11677484214305878, + "step": 365 + }, + { + "completion_length": 544.8333435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 857.0, + "completions/max_terminated_length": 857.0, + "completions/mean_length": 544.8333740234375, + "completions/mean_terminated_length": 544.8333740234375, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "epoch": 0.12415196743554953, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2730981707572937, + "kl": 0.0, + "learning_rate": 4.4565217391304346e-07, + "loss": -0.0016, + "num_tokens": 12739961.0, + "reward": 1.0208334922790527, + "reward_std": 0.19391795992851257, + "rewards/correctness_reward_func/mean": 0.7333333492279053, + "rewards/correctness_reward_func/std": 0.23094011843204498, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 366 + }, + { + "completion_length": 2281.8334350585938, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5693.0, + "completions/mean_length": 2830.916748046875, + "completions/mean_terminated_length": 2489.272705078125, + "completions/min_length": 815.0, + "completions/min_terminated_length": 815.0, + "epoch": 0.12449118046132972, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7804614305496216, + "kl": NaN, + "learning_rate": 4.45479641131815e-07, + "loss": -0.0158, + "num_tokens": 12782919.0, + "reward": 0.8416666984558105, + "reward_std": 0.1855172961950302, + "rewards/correctness_reward_func/mean": 0.5666666626930237, + "rewards/correctness_reward_func/std": 0.5033223032951355, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 367 + }, + { + "completion_length": 1517.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2935.0, + "completions/max_terminated_length": 2935.0, + "completions/mean_length": 1517.416748046875, + "completions/mean_terminated_length": 1517.416748046875, + "completions/min_length": 459.0, + "completions/min_terminated_length": 459.0, + "epoch": 0.1248303934871099, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.2498109924763412e-07, + "kl": 0.0, + "learning_rate": 4.453071083505866e-07, + "loss": 0.0, + "num_tokens": 12811460.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 368 + }, + { + "completion_length": 2193.5833740234375, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4981.0, + "completions/mean_length": 3291.75, + "completions/mean_terminated_length": 2632.300048828125, + "completions/min_length": 1154.0, + "completions/min_terminated_length": 1154.0, + "epoch": 0.1251696065128901, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.574193000793457, + "kl": NaN, + "learning_rate": 4.4513457556935817e-07, + "loss": -0.0544, + "num_tokens": 12849537.0, + "reward": 0.9166667461395264, + "reward_std": 0.2840188145637512, + "rewards/correctness_reward_func/mean": 0.6666666865348816, + "rewards/correctness_reward_func/std": 0.31139957904815674, + "rewards/format_reward_func/mean": 0.2500000298023224, + "rewards/format_reward_func/std": 0.11677484214305878, + "step": 369 + }, + { + "completion_length": 2472.666748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5330.0, + "completions/max_terminated_length": 5330.0, + "completions/mean_length": 2472.666748046875, + "completions/mean_terminated_length": 2472.666748046875, + "completions/min_length": 976.0, + "completions/min_terminated_length": 976.0, + "epoch": 0.1255088195386703, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.123369936020026e-07, + "kl": 0.0, + "learning_rate": 4.449620427881297e-07, + "loss": 0.0, + "num_tokens": 12895625.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 370 + }, + { + "completion_length": 3849.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5591.0, + "completions/max_terminated_length": 5591.0, + "completions/mean_length": 3849.166748046875, + "completions/mean_terminated_length": 3849.166748046875, + "completions/min_length": 2576.0, + "completions/min_terminated_length": 2576.0, + "epoch": 0.12584803256445048, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.506696343421936, + "kl": 0.0, + "learning_rate": 4.447895100069013e-07, + "loss": -0.0017, + "num_tokens": 12953713.0, + "reward": 0.8666666746139526, + "reward_std": 0.1632993221282959, + "rewards/correctness_reward_func/mean": 0.5666666626930237, + "rewards/correctness_reward_func/std": 0.5033223032951355, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 371 + }, + { + "completion_length": 1875.416748046875, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5215.0, + "completions/mean_length": 3522.666748046875, + "completions/mean_terminated_length": 2500.5556640625, + "completions/min_length": 1012.0, + "completions/min_terminated_length": 1012.0, + "epoch": 0.12618724559023067, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.761985719203949, + "kl": NaN, + "learning_rate": 4.446169772256729e-07, + "loss": -0.085, + "num_tokens": 12991038.0, + "reward": 0.4916667342185974, + "reward_std": 0.4645467698574066, + "rewards/correctness_reward_func/mean": 0.2666666805744171, + "rewards/correctness_reward_func/std": 0.393892765045166, + "rewards/format_reward_func/mean": 0.22500000894069672, + "rewards/format_reward_func/std": 0.13568010926246643, + "step": 372 + }, + { + "completion_length": 2674.416748046875, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6334.0, + "completions/mean_length": 3772.58349609375, + "completions/mean_terminated_length": 3209.300048828125, + "completions/min_length": 1402.0, + "completions/min_terminated_length": 1402.0, + "epoch": 0.12652645861601086, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.10343494266271591, + "kl": NaN, + "learning_rate": 4.444444444444444e-07, + "loss": -0.0178, + "num_tokens": 13036073.0, + "reward": 0.25, + "reward_std": 0.0774596780538559, + "rewards/correctness_reward_func/mean": 0.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.2500000298023224, + "rewards/format_reward_func/std": 0.11677484214305878, + "step": 373 + }, + { + "completion_length": 2471.08349609375, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4921.0, + "completions/mean_length": 3569.25, + "completions/mean_terminated_length": 2965.300048828125, + "completions/min_length": 1494.0, + "completions/min_terminated_length": 1494.0, + "epoch": 0.12686567164179105, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7868385910987854, + "kl": NaN, + "learning_rate": 4.44271911663216e-07, + "loss": -0.0781, + "num_tokens": 13080804.0, + "reward": 0.783333420753479, + "reward_std": 0.4425841271877289, + "rewards/correctness_reward_func/mean": 0.5333333611488342, + "rewards/correctness_reward_func/std": 0.3938928246498108, + "rewards/format_reward_func/mean": 0.2500000298023224, + "rewards/format_reward_func/std": 0.11677484214305878, + "step": 374 + }, + { + "completion_length": 1423.1666870117188, + "completions/clipped_ratio": 0.5, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4723.0, + "completions/mean_length": 4717.6669921875, + "completions/mean_terminated_length": 2846.33349609375, + "completions/min_length": 1608.0, + "completions/min_terminated_length": 1608.0, + "epoch": 0.12720488466757124, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": NaN, + "learning_rate": 4.4409937888198754e-07, + "loss": 0.0, + "num_tokens": 13109138.0, + "reward": 0.6499999761581421, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.15000000596046448, + "rewards/format_reward_func/std": 0.15666989982128143, + "step": 375 + }, + { + "completion_length": 2622.75, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4907.0, + "completions/mean_length": 3171.83349609375, + "completions/mean_terminated_length": 2861.181884765625, + "completions/min_length": 706.0, + "completions/min_terminated_length": 706.0, + "epoch": 0.12754409769335143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6112473607063293, + "kl": NaN, + "learning_rate": 4.4392684610075915e-07, + "loss": -0.0348, + "num_tokens": 13149719.0, + "reward": 0.9750000834465027, + "reward_std": 0.4121825695037842, + "rewards/correctness_reward_func/mean": 0.699999988079071, + "rewards/correctness_reward_func/std": 0.3357488214969635, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 376 + }, + { + "completion_length": 2402.0001220703125, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5788.0, + "completions/mean_length": 2951.08349609375, + "completions/mean_terminated_length": 2620.36376953125, + "completions/min_length": 1429.0, + "completions/min_terminated_length": 1429.0, + "epoch": 0.12788331071913162, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2551642656326294, + "kl": NaN, + "learning_rate": 4.4375431331953065e-07, + "loss": -0.0368, + "num_tokens": 13188329.0, + "reward": 1.0750000476837158, + "reward_std": 0.2761763334274292, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.2696799635887146, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 377 + }, + { + "completion_length": 2022.0000610351562, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6286.0, + "completions/mean_length": 2571.08349609375, + "completions/mean_terminated_length": 2205.818359375, + "completions/min_length": 490.0, + "completions/min_terminated_length": 490.0, + "epoch": 0.1282225237449118, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7428491711616516, + "kl": NaN, + "learning_rate": 4.4358178053830225e-07, + "loss": -0.0548, + "num_tokens": 13223171.0, + "reward": 0.6083333492279053, + "reward_std": 0.2925342321395874, + "rewards/correctness_reward_func/mean": 0.38333332538604736, + "rewards/correctness_reward_func/std": 0.4783177673816681, + "rewards/format_reward_func/mean": 0.2250000238418579, + "rewards/format_reward_func/std": 0.10112998634576797, + "step": 378 + }, + { + "completion_length": 877.5000457763672, + "completions/clipped_ratio": 0.33333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4776.0, + "completions/mean_length": 3073.83349609375, + "completions/mean_terminated_length": 1316.25, + "completions/min_length": 463.0, + "completions/min_terminated_length": 463.0, + "epoch": 0.128561736770692, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.17597438395023346, + "kl": NaN, + "learning_rate": 4.4340924775707386e-07, + "loss": -0.019, + "num_tokens": 13243979.0, + "reward": 0.699999988079071, + "reward_std": 0.0774596780538559, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.20000000298023224, + "rewards/format_reward_func/std": 0.14770980179309845, + "step": 379 + }, + { + "completion_length": 1323.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1966.0, + "completions/max_terminated_length": 1966.0, + "completions/mean_length": 1323.166748046875, + "completions/mean_terminated_length": 1323.166748046875, + "completions/min_length": 777.0, + "completions/min_terminated_length": 777.0, + "epoch": 0.12890094979647218, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.432367149758454e-07, + "loss": 0.0, + "num_tokens": 13267591.0, + "reward": 1.2999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 1.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 380 + }, + { + "completion_length": 1451.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3437.0, + "completions/max_terminated_length": 3437.0, + "completions/mean_length": 1451.8333740234375, + "completions/mean_terminated_length": 1451.8333740234375, + "completions/min_length": 423.0, + "completions/min_terminated_length": 423.0, + "epoch": 0.12924016282225237, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7834616899490356, + "kl": 0.0, + "learning_rate": 4.4306418219461696e-07, + "loss": -0.0123, + "num_tokens": 13297223.0, + "reward": 0.8666667342185974, + "reward_std": 0.2581988573074341, + "rewards/correctness_reward_func/mean": 0.5666666626930237, + "rewards/correctness_reward_func/std": 0.42497774958610535, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 381 + }, + { + "completion_length": 1094.0833435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2571.0, + "completions/max_terminated_length": 2571.0, + "completions/mean_length": 1094.0833740234375, + "completions/mean_terminated_length": 1094.0833740234375, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.12957937584803256, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.6127665364583663e-07, + "kl": 0.0, + "learning_rate": 4.428916494133885e-07, + "loss": 0.0, + "num_tokens": 13324314.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 382 + }, + { + "completion_length": 2174.916748046875, + "completions/clipped_ratio": 0.5, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5623.0, + "completions/mean_length": 5469.4169921875, + "completions/mean_terminated_length": 4349.83349609375, + "completions/min_length": 3537.0, + "completions/min_terminated_length": 3537.0, + "epoch": 0.12991858887381275, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7466797232627869, + "kl": NaN, + "learning_rate": 4.427191166321601e-07, + "loss": 0.0017, + "num_tokens": 13360529.0, + "reward": 0.3999999761581421, + "reward_std": 0.27386125922203064, + "rewards/correctness_reward_func/mean": 0.25, + "rewards/correctness_reward_func/std": 0.45226702094078064, + "rewards/format_reward_func/mean": 0.15000000596046448, + "rewards/format_reward_func/std": 0.15666989982128143, + "step": 383 + }, + { + "completion_length": 2885.0001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5507.0, + "completions/max_terminated_length": 5507.0, + "completions/mean_length": 2885.0, + "completions/mean_terminated_length": 2885.0, + "completions/min_length": 690.0, + "completions/min_terminated_length": 690.0, + "epoch": 0.13025780189959293, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.003652934325146e-08, + "kl": 0.0, + "learning_rate": 4.425465838509316e-07, + "loss": 0.0, + "num_tokens": 13409909.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 384 + }, + { + "completion_length": 1805.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3747.0, + "completions/max_terminated_length": 3747.0, + "completions/mean_length": 1805.916748046875, + "completions/mean_terminated_length": 1805.916748046875, + "completions/min_length": 892.0, + "completions/min_terminated_length": 892.0, + "epoch": 0.13059701492537312, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4168512523174286, + "kl": 0.0, + "learning_rate": 4.4237405106970323e-07, + "loss": -0.0029, + "num_tokens": 13443088.0, + "reward": 1.0208334922790527, + "reward_std": 0.19391795992851257, + "rewards/correctness_reward_func/mean": 0.7333333492279053, + "rewards/correctness_reward_func/std": 0.23094011843204498, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 385 + }, + { + "completion_length": 1178.0000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2213.0, + "completions/max_terminated_length": 2213.0, + "completions/mean_length": 1178.0, + "completions/mean_terminated_length": 1178.0, + "completions/min_length": 848.0, + "completions/min_terminated_length": 848.0, + "epoch": 0.1309362279511533, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14581476151943207, + "kl": 0.0, + "learning_rate": 4.422015182884748e-07, + "loss": -0.0007, + "num_tokens": 13469020.0, + "reward": 1.2166666984558105, + "reward_std": 0.10641199350357056, + "rewards/correctness_reward_func/mean": 0.9166666865348816, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 386 + }, + { + "completion_length": 1169.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1971.0, + "completions/max_terminated_length": 1971.0, + "completions/mean_length": 1169.8333740234375, + "completions/mean_terminated_length": 1169.8333740234375, + "completions/min_length": 731.0, + "completions/min_terminated_length": 731.0, + "epoch": 0.13127544097693353, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0867735743522644, + "kl": 0.0, + "learning_rate": 4.420289855072464e-07, + "loss": -0.0019, + "num_tokens": 13495550.0, + "reward": 0.7666666507720947, + "reward_std": 0.051639750599861145, + "rewards/correctness_reward_func/mean": 0.46666666865348816, + "rewards/correctness_reward_func/std": 0.4923659563064575, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 387 + }, + { + "completion_length": 1863.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5399.0, + "completions/max_terminated_length": 5399.0, + "completions/mean_length": 1863.166748046875, + "completions/mean_terminated_length": 1863.166748046875, + "completions/min_length": 535.0, + "completions/min_terminated_length": 535.0, + "epoch": 0.13161465400271372, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.49067234992980957, + "kl": 0.0, + "learning_rate": 4.418564527260179e-07, + "loss": 0.0198, + "num_tokens": 13532824.0, + "reward": 1.2000000476837158, + "reward_std": 0.20000001788139343, + "rewards/correctness_reward_func/mean": 0.9000000357627869, + "rewards/correctness_reward_func/std": 0.28919950127601624, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 388 + }, + { + "completion_length": 1987.416748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4437.0, + "completions/max_terminated_length": 4437.0, + "completions/mean_length": 1987.416748046875, + "completions/mean_terminated_length": 1987.416748046875, + "completions/min_length": 896.0, + "completions/min_terminated_length": 896.0, + "epoch": 0.1319538670284939, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6929435133934021, + "kl": 0.0, + "learning_rate": 4.416839199447895e-07, + "loss": -0.0, + "num_tokens": 13566951.0, + "reward": 0.7750000953674316, + "reward_std": 0.2602938413619995, + "rewards/correctness_reward_func/mean": 0.4999999701976776, + "rewards/correctness_reward_func/std": 0.4472135901451111, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 389 + }, + { + "completion_length": 1799.5000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2813.0, + "completions/max_terminated_length": 2813.0, + "completions/mean_length": 1799.5, + "completions/mean_terminated_length": 1799.5, + "completions/min_length": 1107.0, + "completions/min_terminated_length": 1107.0, + "epoch": 0.1322930800542741, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.12015525251626968, + "kl": 0.0, + "learning_rate": 4.4151138716356105e-07, + "loss": 0.0012, + "num_tokens": 13601895.0, + "reward": 1.1500000953674316, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.8499999642372131, + "rewards/correctness_reward_func/std": 0.09045340120792389, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 390 + }, + { + "completion_length": 1699.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4197.0, + "completions/max_terminated_length": 4197.0, + "completions/mean_length": 1699.416748046875, + "completions/mean_terminated_length": 1699.416748046875, + "completions/min_length": 847.0, + "completions/min_terminated_length": 847.0, + "epoch": 0.13263229308005428, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.39857861399650574, + "kl": 0.0, + "learning_rate": 4.4133885438233265e-07, + "loss": -0.0103, + "num_tokens": 13631660.0, + "reward": 1.1166667938232422, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func/mean": 0.8166666030883789, + "rewards/correctness_reward_func/std": 0.27579087018966675, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 391 + }, + { + "completion_length": 1202.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2096.0, + "completions/max_terminated_length": 2096.0, + "completions/mean_length": 1202.5833740234375, + "completions/mean_terminated_length": 1202.5833740234375, + "completions/min_length": 643.0, + "completions/min_terminated_length": 643.0, + "epoch": 0.13297150610583447, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3004428446292877, + "kl": 0.0, + "learning_rate": 4.4116632160110415e-07, + "loss": 0.0008, + "num_tokens": 13657545.0, + "reward": 1.0375001430511475, + "reward_std": 0.23474279046058655, + "rewards/correctness_reward_func/mean": 0.75, + "rewards/correctness_reward_func/std": 0.24308621883392334, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 392 + }, + { + "completion_length": 3542.166748046875, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5556.0, + "completions/mean_length": 4091.25, + "completions/mean_terminated_length": 3864.181884765625, + "completions/min_length": 2148.0, + "completions/min_terminated_length": 2148.0, + "epoch": 0.13331071913161466, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1583491563796997, + "kl": NaN, + "learning_rate": 4.4099378881987576e-07, + "loss": -0.0071, + "num_tokens": 13712039.0, + "reward": 0.21250002086162567, + "reward_std": 0.06934845447540283, + "rewards/correctness_reward_func/mean": 0.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.21250002086162567, + "rewards/format_reward_func/std": 0.07723929733037949, + "step": 393 + }, + { + "completion_length": 1208.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2981.0, + "completions/max_terminated_length": 2981.0, + "completions/mean_length": 1208.5, + "completions/mean_terminated_length": 1208.5, + "completions/min_length": 619.0, + "completions/min_terminated_length": 619.0, + "epoch": 0.13364993215739485, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.10130270570516586, + "kl": 0.0, + "learning_rate": 4.4082125603864736e-07, + "loss": 0.0026, + "num_tokens": 13737101.0, + "reward": 1.133333444595337, + "reward_std": 0.05163976177573204, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 394 + }, + { + "completion_length": 2585.0001220703125, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5495.0, + "completions/mean_length": 3134.08349609375, + "completions/mean_terminated_length": 2820.0, + "completions/min_length": 1315.0, + "completions/min_terminated_length": 1315.0, + "epoch": 0.13398914518317503, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.608137845993042, + "kl": NaN, + "learning_rate": 4.4064872325741886e-07, + "loss": -0.0506, + "num_tokens": 13782419.0, + "reward": 1.058333396911621, + "reward_std": 0.2877541482448578, + "rewards/correctness_reward_func/mean": 0.7833333015441895, + "rewards/correctness_reward_func/std": 0.26227444410324097, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 395 + }, + { + "completion_length": 3074.166748046875, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5930.0, + "completions/mean_length": 3623.25, + "completions/mean_terminated_length": 3353.636474609375, + "completions/min_length": 1629.0, + "completions/min_terminated_length": 1629.0, + "epoch": 0.13432835820895522, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5104953646659851, + "kl": NaN, + "learning_rate": 4.4047619047619047e-07, + "loss": -0.0287, + "num_tokens": 13833343.0, + "reward": 0.9416667819023132, + "reward_std": 0.24983328580856323, + "rewards/correctness_reward_func/mean": 0.6666666865348816, + "rewards/correctness_reward_func/std": 0.31139957904815674, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660254627466202, + "step": 396 + }, + { + "completion_length": 1857.0833740234375, + "completions/clipped_ratio": 0.5833333333333333, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5661.0, + "completions/mean_length": 5700.6669921875, + "completions/mean_terminated_length": 4457.0, + "completions/min_length": 3203.0, + "completions/min_terminated_length": 3203.0, + "epoch": 0.1346675712347354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1551879197359085, + "kl": NaN, + "learning_rate": 4.40303657694962e-07, + "loss": -0.0141, + "num_tokens": 13864922.0, + "reward": 0.13750001788139343, + "reward_std": 0.09185586869716644, + "rewards/correctness_reward_func/mean": 0.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.13750000298023224, + "rewards/format_reward_func/std": 0.14943073689937592, + "step": 397 + }, + { + "completion_length": 3534.75, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6351.0, + "completions/mean_length": 4632.9169921875, + "completions/mean_terminated_length": 4241.7001953125, + "completions/min_length": 1820.0, + "completions/min_terminated_length": 1820.0, + "epoch": 0.1350067842605156, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.24700386822223663, + "kl": NaN, + "learning_rate": 4.4013112491373363e-07, + "loss": -0.0244, + "num_tokens": 13922897.0, + "reward": 0.6500000953674316, + "reward_std": 0.0774596780538559, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.2500000298023224, + "rewards/format_reward_func/std": 0.11677484214305878, + "step": 398 + }, + { + "completion_length": 997.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1720.0, + "completions/max_terminated_length": 1720.0, + "completions/mean_length": 997.1666870117188, + "completions/mean_terminated_length": 997.1666870117188, + "completions/min_length": 494.0, + "completions/min_terminated_length": 494.0, + "epoch": 0.1353459972862958, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.05267763510346413, + "kl": 0.0, + "learning_rate": 4.3995859213250513e-07, + "loss": 0.0, + "num_tokens": 13944649.0, + "reward": 1.1875, + "reward_std": 0.030618607997894287, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 399 + }, + { + "completion_length": 2047.666748046875, + "completions/clipped_ratio": 0.33333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6290.0, + "completions/mean_length": 4244.0, + "completions/mean_terminated_length": 3071.5, + "completions/min_length": 1685.0, + "completions/min_terminated_length": 1685.0, + "epoch": 0.13568521031207598, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.15240563452243805, + "kl": NaN, + "learning_rate": 4.3978605935127673e-07, + "loss": -0.0224, + "num_tokens": 13983681.0, + "reward": 0.6250001192092896, + "reward_std": 0.06708204001188278, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.22500000894069672, + "rewards/format_reward_func/std": 0.11965861916542053, + "step": 400 + }, + { + "completion_length": 1873.4166870117188, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6427.0, + "completions/mean_length": 2422.5, + "completions/mean_terminated_length": 2043.727294921875, + "completions/min_length": 580.0, + "completions/min_terminated_length": 580.0, + "epoch": 0.13602442333785617, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0715797170996666, + "kl": NaN, + "learning_rate": 4.396135265700483e-07, + "loss": -0.0117, + "num_tokens": 14017424.0, + "reward": 0.7749999761581421, + "reward_std": 0.06123724579811096, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 401 + }, + { + "completion_length": 2485.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3410.0, + "completions/max_terminated_length": 3410.0, + "completions/mean_length": 2485.58349609375, + "completions/mean_terminated_length": 2485.58349609375, + "completions/min_length": 1527.0, + "completions/min_terminated_length": 1527.0, + "epoch": 0.13636363636363635, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.7927711698794155e-07, + "kl": 0.0, + "learning_rate": 4.394409937888199e-07, + "loss": 0.0, + "num_tokens": 14057679.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 402 + }, + { + "completion_length": 979.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1800.0, + "completions/max_terminated_length": 1800.0, + "completions/mean_length": 979.1666870117188, + "completions/mean_terminated_length": 979.1666870117188, + "completions/min_length": 479.0, + "completions/min_terminated_length": 479.0, + "epoch": 0.13670284938941654, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12159288674592972, + "kl": 0.0, + "learning_rate": 4.392684610075914e-07, + "loss": 0.0013, + "num_tokens": 14083013.0, + "reward": 1.254166603088379, + "reward_std": 0.08225837349891663, + "rewards/correctness_reward_func/mean": 0.9666666984558105, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 403 + }, + { + "completion_length": 784.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1312.0, + "completions/max_terminated_length": 1312.0, + "completions/mean_length": 784.6666870117188, + "completions/mean_terminated_length": 784.6666870117188, + "completions/min_length": 473.0, + "completions/min_terminated_length": 473.0, + "epoch": 0.13704206241519673, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.06961729377508163, + "kl": 0.0, + "learning_rate": 4.39095928226363e-07, + "loss": -0.0009, + "num_tokens": 14105899.0, + "reward": 1.2166666984558105, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.9166666865348816, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 404 + }, + { + "completion_length": 1378.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2759.0, + "completions/max_terminated_length": 2759.0, + "completions/mean_length": 1378.8333740234375, + "completions/mean_terminated_length": 1378.8333740234375, + "completions/min_length": 540.0, + "completions/min_terminated_length": 540.0, + "epoch": 0.13738127544097695, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.12275088578462601, + "kl": 0.0, + "learning_rate": 4.3892339544513455e-07, + "loss": -0.0004, + "num_tokens": 14135267.0, + "reward": 0.7333333492279053, + "reward_std": 0.05163976177573204, + "rewards/correctness_reward_func/mean": 0.43333330750465393, + "rewards/correctness_reward_func/std": 0.45792683959007263, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 405 + }, + { + "completion_length": 1384.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2235.0, + "completions/max_terminated_length": 2235.0, + "completions/mean_length": 1384.666748046875, + "completions/mean_terminated_length": 1384.666748046875, + "completions/min_length": 715.0, + "completions/min_terminated_length": 715.0, + "epoch": 0.13772048846675713, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1979372516179865e-07, + "kl": 0.0, + "learning_rate": 4.387508626639061e-07, + "loss": 0.0, + "num_tokens": 14159677.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 406 + }, + { + "completion_length": 1979.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5850.0, + "completions/max_terminated_length": 5850.0, + "completions/mean_length": 1979.0833740234375, + "completions/mean_terminated_length": 1979.0833740234375, + "completions/min_length": 654.0, + "completions/min_terminated_length": 654.0, + "epoch": 0.13805970149253732, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5809157490730286, + "kl": 0.0, + "learning_rate": 4.3857832988267766e-07, + "loss": 0.0523, + "num_tokens": 14196080.0, + "reward": 0.9666668176651001, + "reward_std": 0.20655912160873413, + "rewards/correctness_reward_func/mean": 0.6666666865348816, + "rewards/correctness_reward_func/std": 0.31139957904815674, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 407 + }, + { + "completion_length": 2523.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4132.0, + "completions/max_terminated_length": 4132.0, + "completions/mean_length": 2523.916748046875, + "completions/mean_terminated_length": 2523.916748046875, + "completions/min_length": 1115.0, + "completions/min_terminated_length": 1115.0, + "epoch": 0.1383989145183175, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.13894988596439362, + "kl": 0.0, + "learning_rate": 4.3840579710144926e-07, + "loss": -0.0037, + "num_tokens": 14239801.0, + "reward": 1.133333444595337, + "reward_std": 0.05163976177573204, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 408 + }, + { + "completion_length": 1763.7500457763672, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6448.0, + "completions/mean_length": 3411.0, + "completions/mean_terminated_length": 2351.666748046875, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "epoch": 0.1387381275440977, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8226115107536316, + "kl": NaN, + "learning_rate": 4.3823326432022087e-07, + "loss": -0.1241, + "num_tokens": 14273902.0, + "reward": 0.7458333969116211, + "reward_std": 0.4951653480529785, + "rewards/correctness_reward_func/mean": 0.5333333611488342, + "rewards/correctness_reward_func/std": 0.3938928246498108, + "rewards/format_reward_func/mean": 0.21250002086162567, + "rewards/format_reward_func/std": 0.13505050539970398, + "step": 409 + }, + { + "completion_length": 1476.4166870117188, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5934.0, + "completions/mean_length": 3123.666748046875, + "completions/mean_terminated_length": 1968.5555419921875, + "completions/min_length": 708.0, + "completions/min_terminated_length": 708.0, + "epoch": 0.1390773405698779, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1162936687469482, + "kl": NaN, + "learning_rate": 4.3806073153899237e-07, + "loss": -0.082, + "num_tokens": 14304171.0, + "reward": 0.7583334445953369, + "reward_std": 0.27095508575439453, + "rewards/correctness_reward_func/mean": 0.5333333611488342, + "rewards/correctness_reward_func/std": 0.3938928246498108, + "rewards/format_reward_func/mean": 0.22500000894069672, + "rewards/format_reward_func/std": 0.13568010926246643, + "step": 410 + }, + { + "completion_length": 3505.5833740234375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6032.0, + "completions/mean_length": 4054.666748046875, + "completions/mean_terminated_length": 3824.27294921875, + "completions/min_length": 1969.0, + "completions/min_terminated_length": 1969.0, + "epoch": 0.13941655359565808, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07667145878076553, + "kl": NaN, + "learning_rate": 4.3788819875776397e-07, + "loss": -0.015, + "num_tokens": 14359846.0, + "reward": 0.6750000715255737, + "reward_std": 0.06123724579811096, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 411 + }, + { + "completion_length": 1065.6667175292969, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2098.0, + "completions/max_terminated_length": 2098.0, + "completions/mean_length": 1065.666748046875, + "completions/mean_terminated_length": 1065.666748046875, + "completions/min_length": 619.0, + "completions/min_terminated_length": 619.0, + "epoch": 0.13975576662143827, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.403994083404541, + "kl": 0.0, + "learning_rate": 4.377156659765355e-07, + "loss": 0.0055, + "num_tokens": 14381556.0, + "reward": 0.9874999523162842, + "reward_std": 0.2497076690196991, + "rewards/correctness_reward_func/mean": 0.699999988079071, + "rewards/correctness_reward_func/std": 0.43064433336257935, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 412 + }, + { + "completion_length": 1031.8333740234375, + "completions/clipped_ratio": 0.41666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6426.0, + "completions/mean_length": 3777.25, + "completions/mean_terminated_length": 1768.857177734375, + "completions/min_length": 713.0, + "completions/min_terminated_length": 713.0, + "epoch": 0.14009497964721845, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6786950826644897, + "kl": NaN, + "learning_rate": 4.3754313319530713e-07, + "loss": -0.0741, + "num_tokens": 14406148.0, + "reward": 0.7291666865348816, + "reward_std": 0.25515520572662354, + "rewards/correctness_reward_func/mean": 0.5666666626930237, + "rewards/correctness_reward_func/std": 0.5033223032951355, + "rewards/format_reward_func/mean": 0.16250000894069672, + "rewards/format_reward_func/std": 0.14943073689937592, + "step": 413 + }, + { + "completion_length": 2563.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5917.0, + "completions/max_terminated_length": 5917.0, + "completions/mean_length": 2563.166748046875, + "completions/mean_terminated_length": 2563.166748046875, + "completions/min_length": 726.0, + "completions/min_terminated_length": 726.0, + "epoch": 0.14043419267299864, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6143797039985657, + "kl": 0.0, + "learning_rate": 4.3737060041407863e-07, + "loss": 0.0111, + "num_tokens": 14447670.0, + "reward": 0.9541667699813843, + "reward_std": 0.22716552019119263, + "rewards/correctness_reward_func/mean": 0.6666666865348816, + "rewards/correctness_reward_func/std": 0.31139957904815674, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 414 + }, + { + "completion_length": 909.0833435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2567.0, + "completions/max_terminated_length": 2567.0, + "completions/mean_length": 909.0833740234375, + "completions/mean_terminated_length": 909.0833740234375, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "epoch": 0.14077340569877883, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.1539605654652405e-07, + "kl": 0.0, + "learning_rate": 4.3719806763285024e-07, + "loss": 0.0, + "num_tokens": 14472895.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 415 + }, + { + "completion_length": 2452.0834350585938, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5666.0, + "completions/max_terminated_length": 5666.0, + "completions/mean_length": 2452.08349609375, + "completions/mean_terminated_length": 2452.08349609375, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "epoch": 0.14111261872455902, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4528708755970001, + "kl": 0.0, + "learning_rate": 4.370255348516218e-07, + "loss": -0.0006, + "num_tokens": 14513966.0, + "reward": 0.9708334803581238, + "reward_std": 0.2679903507232666, + "rewards/correctness_reward_func/mean": 0.6833333373069763, + "rewards/correctness_reward_func/std": 0.32427072525024414, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 416 + }, + { + "completion_length": 695.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1177.0, + "completions/max_terminated_length": 1177.0, + "completions/mean_length": 695.9166870117188, + "completions/mean_terminated_length": 695.9166870117188, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "epoch": 0.1414518317503392, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.090449757874012, + "kl": 0.0, + "learning_rate": 4.3685300207039334e-07, + "loss": 0.0007, + "num_tokens": 14531941.0, + "reward": 1.2333333492279053, + "reward_std": 0.051639772951602936, + "rewards/correctness_reward_func/mean": 0.9333333373069763, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 417 + }, + { + "completion_length": 2134.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5264.0, + "completions/max_terminated_length": 5264.0, + "completions/mean_length": 2134.666748046875, + "completions/mean_terminated_length": 2134.666748046875, + "completions/min_length": 599.0, + "completions/min_terminated_length": 599.0, + "epoch": 0.1417910447761194, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5045371651649475, + "kl": 0.0, + "learning_rate": 4.366804692891649e-07, + "loss": 0.0213, + "num_tokens": 14567709.0, + "reward": 0.9000000953674316, + "reward_std": 0.21908903121948242, + "rewards/correctness_reward_func/mean": 0.6000000238418579, + "rewards/correctness_reward_func/std": 0.36181360483169556, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 418 + }, + { + "completion_length": 2361.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4302.0, + "completions/max_terminated_length": 4302.0, + "completions/mean_length": 2361.166748046875, + "completions/mean_terminated_length": 2361.166748046875, + "completions/min_length": 1277.0, + "completions/min_terminated_length": 1277.0, + "epoch": 0.14213025780189958, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.11170516163110733, + "kl": 0.0, + "learning_rate": 4.365079365079365e-07, + "loss": 0.0027, + "num_tokens": 14606201.0, + "reward": 1.1666667461395264, + "reward_std": 0.051639750599861145, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 419 + }, + { + "completion_length": 1313.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2266.0, + "completions/max_terminated_length": 2266.0, + "completions/mean_length": 1313.75, + "completions/mean_terminated_length": 1313.75, + "completions/min_length": 760.0, + "completions/min_terminated_length": 760.0, + "epoch": 0.14246947082767977, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.5472624542999256e-07, + "kl": 0.0, + "learning_rate": 4.3633540372670805e-07, + "loss": 0.0, + "num_tokens": 14639696.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 420 + }, + { + "completion_length": 3634.750244140625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5848.0, + "completions/max_terminated_length": 5848.0, + "completions/mean_length": 3634.75, + "completions/mean_terminated_length": 3634.75, + "completions/min_length": 1178.0, + "completions/min_terminated_length": 1178.0, + "epoch": 0.14280868385345996, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2318860292434692, + "kl": 0.0, + "learning_rate": 4.361628709454796e-07, + "loss": -0.0021, + "num_tokens": 14693759.0, + "reward": 0.7583333849906921, + "reward_std": 0.40680140256881714, + "rewards/correctness_reward_func/mean": 0.4833333492279053, + "rewards/correctness_reward_func/std": 0.5078176856040955, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 421 + }, + { + "completion_length": 2677.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6421.0, + "completions/max_terminated_length": 6421.0, + "completions/mean_length": 2677.75, + "completions/mean_terminated_length": 2677.75, + "completions/min_length": 1236.0, + "completions/min_terminated_length": 1236.0, + "epoch": 0.14314789687924015, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6643301844596863, + "kl": 0.0, + "learning_rate": 4.3599033816425116e-07, + "loss": -0.0141, + "num_tokens": 14740298.0, + "reward": 0.9000000953674316, + "reward_std": 0.21908903121948242, + "rewards/correctness_reward_func/mean": 0.6000000238418579, + "rewards/correctness_reward_func/std": 0.36181360483169556, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 422 + }, + { + "completion_length": 1688.0000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2633.0, + "completions/max_terminated_length": 2633.0, + "completions/mean_length": 1688.0, + "completions/mean_terminated_length": 1688.0, + "completions/min_length": 949.0, + "completions/min_terminated_length": 949.0, + "epoch": 0.14348710990502037, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.688114761895122e-07, + "kl": 0.0, + "learning_rate": 4.3581780538302277e-07, + "loss": 0.0, + "num_tokens": 14770574.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 423 + }, + { + "completion_length": 1518.0000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2738.0, + "completions/max_terminated_length": 2738.0, + "completions/mean_length": 1518.0, + "completions/mean_terminated_length": 1518.0, + "completions/min_length": 905.0, + "completions/min_terminated_length": 905.0, + "epoch": 0.14382632293080055, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07617669552564621, + "kl": 0.0, + "learning_rate": 4.356452726017943e-07, + "loss": -0.0, + "num_tokens": 14802632.0, + "reward": 0.7166666984558105, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 424 + }, + { + "completion_length": 2456.166748046875, + "completions/clipped_ratio": 0.5, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6572.0, + "completions/mean_length": 5750.6669921875, + "completions/mean_terminated_length": 4912.33349609375, + "completions/min_length": 1737.0, + "completions/min_terminated_length": 1737.0, + "epoch": 0.14416553595658074, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7163939476013184, + "kl": NaN, + "learning_rate": 4.3547273982056587e-07, + "loss": -0.0691, + "num_tokens": 14845864.0, + "reward": 0.2958333492279053, + "reward_std": 0.29602330923080444, + "rewards/correctness_reward_func/mean": 0.13333334028720856, + "rewards/correctness_reward_func/std": 0.3113996088504791, + "rewards/format_reward_func/mean": 0.16250000894069672, + "rewards/format_reward_func/std": 0.14943073689937592, + "step": 425 + }, + { + "completion_length": 1030.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1618.0, + "completions/max_terminated_length": 1618.0, + "completions/mean_length": 1030.8333740234375, + "completions/mean_terminated_length": 1030.8333740234375, + "completions/min_length": 699.0, + "completions/min_terminated_length": 699.0, + "epoch": 0.14450474898236093, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10956176370382309, + "kl": 0.0, + "learning_rate": 4.353002070393375e-07, + "loss": 0.0006, + "num_tokens": 14864090.0, + "reward": 1.183333396911621, + "reward_std": 0.09246458858251572, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.10298573225736618, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 426 + }, + { + "completion_length": 919.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3516.0, + "completions/max_terminated_length": 3516.0, + "completions/mean_length": 919.3333740234375, + "completions/mean_terminated_length": 919.3333740234375, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "epoch": 0.14484396200814112, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.11441392451524734, + "kl": 0.0, + "learning_rate": 4.3512767425810903e-07, + "loss": -0.0054, + "num_tokens": 14889018.0, + "reward": 0.7166666984558105, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 427 + }, + { + "completion_length": 3217.2501220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6538.0, + "completions/max_terminated_length": 6538.0, + "completions/mean_length": 3217.25, + "completions/mean_terminated_length": 3217.25, + "completions/min_length": 1158.0, + "completions/min_terminated_length": 1158.0, + "epoch": 0.1451831750339213, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.20972996950149536, + "kl": 0.0, + "learning_rate": 4.349551414768806e-07, + "loss": -0.0038, + "num_tokens": 14938941.0, + "reward": 1.1666667461395264, + "reward_std": 0.05163975805044174, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 428 + }, + { + "completion_length": 2134.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3320.0, + "completions/max_terminated_length": 3320.0, + "completions/mean_length": 2134.916748046875, + "completions/mean_terminated_length": 2134.916748046875, + "completions/min_length": 1154.0, + "completions/min_terminated_length": 1154.0, + "epoch": 0.1455223880597015, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.10739568620920181, + "kl": 0.0, + "learning_rate": 4.3478260869565214e-07, + "loss": 0.0037, + "num_tokens": 14972300.0, + "reward": 1.25, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.949999988079071, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 429 + }, + { + "completion_length": 1214.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3529.0, + "completions/max_terminated_length": 3529.0, + "completions/mean_length": 1214.75, + "completions/mean_terminated_length": 1214.75, + "completions/min_length": 545.0, + "completions/min_terminated_length": 545.0, + "epoch": 0.14586160108548168, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3598671853542328, + "kl": 0.0, + "learning_rate": 4.3461007591442374e-07, + "loss": 0.0009, + "num_tokens": 15001169.0, + "reward": 1.129166603088379, + "reward_std": 0.261197566986084, + "rewards/correctness_reward_func/mean": 0.9166666865348816, + "rewards/correctness_reward_func/std": 0.28867512941360474, + "rewards/format_reward_func/mean": 0.21250002086162567, + "rewards/format_reward_func/std": 0.07723929733037949, + "step": 430 + }, + { + "completion_length": 1586.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3801.0, + "completions/max_terminated_length": 3801.0, + "completions/mean_length": 1586.666748046875, + "completions/mean_terminated_length": 1586.666748046875, + "completions/min_length": 387.0, + "completions/min_terminated_length": 387.0, + "epoch": 0.14620081411126187, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.477949922445987e-07, + "kl": 0.0, + "learning_rate": 4.344375431331953e-07, + "loss": 0.0, + "num_tokens": 15033133.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 431 + }, + { + "completion_length": 859.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1384.0, + "completions/max_terminated_length": 1384.0, + "completions/mean_length": 859.0, + "completions/mean_terminated_length": 859.0, + "completions/min_length": 447.0, + "completions/min_terminated_length": 447.0, + "epoch": 0.14654002713704206, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.039414018392562866, + "kl": 0.0, + "learning_rate": 4.3426501035196685e-07, + "loss": -0.001, + "num_tokens": 15053815.0, + "reward": 0.6875001788139343, + "reward_std": 0.03061862289905548, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 432 + }, + { + "completion_length": 740.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1106.0, + "completions/max_terminated_length": 1106.0, + "completions/mean_length": 740.6666870117188, + "completions/mean_terminated_length": 740.6666870117188, + "completions/min_length": 488.0, + "completions/min_terminated_length": 488.0, + "epoch": 0.14687924016282225, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07417774200439453, + "kl": 0.0, + "learning_rate": 4.340924775707384e-07, + "loss": -0.0005, + "num_tokens": 15074877.0, + "reward": 1.2708332538604736, + "reward_std": 0.07144343107938766, + "rewards/correctness_reward_func/mean": 0.9833333492279053, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 433 + }, + { + "completion_length": 3072.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4774.0, + "completions/max_terminated_length": 4774.0, + "completions/mean_length": 3072.166748046875, + "completions/mean_terminated_length": 3072.166748046875, + "completions/min_length": 1584.0, + "completions/min_terminated_length": 1584.0, + "epoch": 0.14721845318860244, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.7396959012548905e-07, + "kl": 0.0, + "learning_rate": 4.3391994478951e-07, + "loss": 0.0, + "num_tokens": 15122993.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 434 + }, + { + "completion_length": 1638.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2871.0, + "completions/max_terminated_length": 2871.0, + "completions/mean_length": 1638.3333740234375, + "completions/mean_terminated_length": 1638.3333740234375, + "completions/min_length": 621.0, + "completions/min_terminated_length": 621.0, + "epoch": 0.14755766621438263, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.3176340846475796e-07, + "kl": 0.0, + "learning_rate": 4.3374741200828156e-07, + "loss": 0.0, + "num_tokens": 15153723.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 435 + }, + { + "completion_length": 2581.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5435.0, + "completions/max_terminated_length": 5435.0, + "completions/mean_length": 2581.0, + "completions/mean_terminated_length": 2581.0, + "completions/min_length": 1202.0, + "completions/min_terminated_length": 1202.0, + "epoch": 0.14789687924016282, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.1409706026315689, + "kl": 0.0, + "learning_rate": 4.335748792270531e-07, + "loss": -0.0058, + "num_tokens": 15193863.0, + "reward": 1.1500000953674316, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.8500000834465027, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 436 + }, + { + "completion_length": 3028.5833740234375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5637.0, + "completions/mean_length": 3577.666748046875, + "completions/mean_terminated_length": 3303.9091796875, + "completions/min_length": 1284.0, + "completions/min_terminated_length": 1284.0, + "epoch": 0.148236092265943, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1644480973482132, + "kl": NaN, + "learning_rate": 4.3340234644582466e-07, + "loss": -0.0131, + "num_tokens": 15246178.0, + "reward": 0.7416666746139526, + "reward_std": 0.1128769963979721, + "rewards/correctness_reward_func/mean": 0.46666666865348816, + "rewards/correctness_reward_func/std": 0.4923659563064575, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 437 + }, + { + "completion_length": 3253.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5697.0, + "completions/max_terminated_length": 5697.0, + "completions/mean_length": 3253.0, + "completions/mean_terminated_length": 3253.0, + "completions/min_length": 1295.0, + "completions/min_terminated_length": 1295.0, + "epoch": 0.1485753052917232, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.13437378406524658, + "kl": 0.0, + "learning_rate": 4.3322981366459627e-07, + "loss": 0.001, + "num_tokens": 15299776.0, + "reward": 1.25, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.949999988079071, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 438 + }, + { + "completion_length": 1089.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2189.0, + "completions/max_terminated_length": 2189.0, + "completions/mean_length": 1089.75, + "completions/mean_terminated_length": 1089.75, + "completions/min_length": 693.0, + "completions/min_terminated_length": 693.0, + "epoch": 0.14891451831750338, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4965176284313202, + "kl": 0.0, + "learning_rate": 4.3305728088336777e-07, + "loss": 0.0096, + "num_tokens": 15323659.0, + "reward": 1.1000001430511475, + "reward_std": 0.23782965540885925, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.2696799635887146, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 439 + }, + { + "completion_length": 1152.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2723.0, + "completions/max_terminated_length": 2723.0, + "completions/mean_length": 1152.75, + "completions/mean_terminated_length": 1152.75, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.14925373134328357, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.04821237921714783, + "kl": 0.0, + "learning_rate": 4.328847481021394e-07, + "loss": -0.0003, + "num_tokens": 15347458.0, + "reward": 1.1875, + "reward_std": 0.030618607997894287, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 440 + }, + { + "completion_length": 2592.0001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4795.0, + "completions/max_terminated_length": 4795.0, + "completions/mean_length": 2592.0, + "completions/mean_terminated_length": 2592.0, + "completions/min_length": 818.0, + "completions/min_terminated_length": 818.0, + "epoch": 0.14959294436906379, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.51977240850465e-07, + "kl": 0.0, + "learning_rate": 4.32712215320911e-07, + "loss": 0.0, + "num_tokens": 15391168.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 441 + }, + { + "completion_length": 2777.666748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4143.0, + "completions/max_terminated_length": 4143.0, + "completions/mean_length": 2777.666748046875, + "completions/mean_terminated_length": 2777.666748046875, + "completions/min_length": 1074.0, + "completions/min_terminated_length": 1074.0, + "epoch": 0.14993215739484397, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.47685208916664124, + "kl": 0.0, + "learning_rate": 4.3253968253968253e-07, + "loss": 0.0155, + "num_tokens": 15437178.0, + "reward": 1.0208334922790527, + "reward_std": 0.19391795992851257, + "rewards/correctness_reward_func/mean": 0.7333333492279053, + "rewards/correctness_reward_func/std": 0.2309401035308838, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 442 + }, + { + "completion_length": 1874.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4018.0, + "completions/max_terminated_length": 4018.0, + "completions/mean_length": 1874.3333740234375, + "completions/mean_terminated_length": 1874.3333740234375, + "completions/min_length": 866.0, + "completions/min_terminated_length": 866.0, + "epoch": 0.15027137042062416, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.1033775582909584, + "kl": 0.0, + "learning_rate": 4.323671497584541e-07, + "loss": -0.0001, + "num_tokens": 15471610.0, + "reward": 1.1166667938232422, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 443 + }, + { + "completion_length": 1630.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4842.0, + "completions/max_terminated_length": 4842.0, + "completions/mean_length": 1630.0833740234375, + "completions/mean_terminated_length": 1630.0833740234375, + "completions/min_length": 564.0, + "completions/min_terminated_length": 564.0, + "epoch": 0.15061058344640435, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.08019285649061203, + "kl": 0.0, + "learning_rate": 4.3219461697722564e-07, + "loss": -0.0026, + "num_tokens": 15504059.0, + "reward": 1.2833333015441895, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.9833333492279053, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 444 + }, + { + "completion_length": 1034.2500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2473.0, + "completions/max_terminated_length": 2473.0, + "completions/mean_length": 1034.25, + "completions/mean_terminated_length": 1034.25, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "epoch": 0.15094979647218454, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.2533315896234853e-07, + "kl": 0.0, + "learning_rate": 4.3202208419599725e-07, + "loss": 0.0, + "num_tokens": 15531314.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 445 + }, + { + "completion_length": 1931.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4648.0, + "completions/max_terminated_length": 4648.0, + "completions/mean_length": 1931.166748046875, + "completions/mean_terminated_length": 1931.166748046875, + "completions/min_length": 519.0, + "completions/min_terminated_length": 519.0, + "epoch": 0.15128900949796473, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7577066421508789, + "kl": 0.0, + "learning_rate": 4.318495514147688e-07, + "loss": 0.0376, + "num_tokens": 15569032.0, + "reward": 0.833333432674408, + "reward_std": 0.20655912160873413, + "rewards/correctness_reward_func/mean": 0.5333333611488342, + "rewards/correctness_reward_func/std": 0.3938928246498108, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 446 + }, + { + "completion_length": 2384.5001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4492.0, + "completions/max_terminated_length": 4492.0, + "completions/mean_length": 2384.5, + "completions/mean_terminated_length": 2384.5, + "completions/min_length": 647.0, + "completions/min_terminated_length": 647.0, + "epoch": 0.15162822252374492, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2805997133255005, + "kl": 0.0, + "learning_rate": 4.3167701863354035e-07, + "loss": -0.0, + "num_tokens": 15613108.0, + "reward": 1.1375000476837158, + "reward_std": 0.07373939454555511, + "rewards/correctness_reward_func/mean": 0.8500000834465027, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 447 + }, + { + "completion_length": 429.8333435058594, + "completions/clipped_ratio": 0.5, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 1278.0, + "completions/mean_length": 3724.33349609375, + "completions/mean_terminated_length": 859.6666870117188, + "completions/min_length": 591.0, + "completions/min_terminated_length": 591.0, + "epoch": 0.1519674355495251, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0236065151048024e-07, + "kl": NaN, + "learning_rate": 4.315044858523119e-07, + "loss": 0.0, + "num_tokens": 15632792.0, + "reward": 0.5500000715255737, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.15000000596046448, + "rewards/format_reward_func/std": 0.15666989982128143, + "step": 448 + }, + { + "completion_length": 2166.666748046875, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5026.0, + "completions/mean_length": 3264.83349609375, + "completions/mean_terminated_length": 2600.0, + "completions/min_length": 1176.0, + "completions/min_terminated_length": 1176.0, + "epoch": 0.1523066485753053, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.06184069439768791, + "kl": NaN, + "learning_rate": 4.313319530710835e-07, + "loss": -0.0104, + "num_tokens": 15670522.0, + "reward": 0.6750000715255737, + "reward_std": 0.06123724579811096, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660254627466202, + "step": 449 + }, + { + "completion_length": 2323.5000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6337.0, + "completions/max_terminated_length": 6337.0, + "completions/mean_length": 2323.5, + "completions/mean_terminated_length": 2323.5, + "completions/min_length": 855.0, + "completions/min_terminated_length": 855.0, + "epoch": 0.15264586160108548, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8851024508476257, + "kl": 0.0, + "learning_rate": 4.3115942028985506e-07, + "loss": 0.0646, + "num_tokens": 15709432.0, + "reward": 1.0166667699813843, + "reward_std": 0.24832776188850403, + "rewards/correctness_reward_func/mean": 0.7166666984558105, + "rewards/correctness_reward_func/std": 0.3459725081920624, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 450 + }, + { + "completion_length": 2351.5001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4691.0, + "completions/max_terminated_length": 4691.0, + "completions/mean_length": 2351.5, + "completions/mean_terminated_length": 2351.5, + "completions/min_length": 829.0, + "completions/min_terminated_length": 829.0, + "epoch": 0.15298507462686567, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.1750710743617674e-07, + "kl": 0.0, + "learning_rate": 4.309868875086266e-07, + "loss": 0.0, + "num_tokens": 15750946.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 451 + }, + { + "completion_length": 2530.0001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5296.0, + "completions/max_terminated_length": 5296.0, + "completions/mean_length": 2530.0, + "completions/mean_terminated_length": 2530.0, + "completions/min_length": 480.0, + "completions/min_terminated_length": 480.0, + "epoch": 0.15332428765264586, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1575542837381363, + "kl": 0.0, + "learning_rate": 4.3081435472739817e-07, + "loss": 0.0015, + "num_tokens": 15794548.0, + "reward": 1.1541666984558105, + "reward_std": 0.08225835859775543, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 452 + }, + { + "completion_length": 1643.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2960.0, + "completions/max_terminated_length": 2960.0, + "completions/mean_length": 1643.3333740234375, + "completions/mean_terminated_length": 1643.3333740234375, + "completions/min_length": 744.0, + "completions/min_terminated_length": 744.0, + "epoch": 0.15366350067842605, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.257485422456739e-07, + "kl": 0.0, + "learning_rate": 4.306418219461698e-07, + "loss": 0.0, + "num_tokens": 15827666.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 453 + }, + { + "completion_length": 1459.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2622.0, + "completions/max_terminated_length": 2622.0, + "completions/mean_length": 1459.916748046875, + "completions/mean_terminated_length": 1459.916748046875, + "completions/min_length": 502.0, + "completions/min_terminated_length": 502.0, + "epoch": 0.15400271370420623, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.304692891649413e-07, + "loss": 0.0, + "num_tokens": 15854089.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 454 + }, + { + "completion_length": 1496.5000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2525.0, + "completions/max_terminated_length": 2525.0, + "completions/mean_length": 1496.5, + "completions/mean_terminated_length": 1496.5, + "completions/min_length": 720.0, + "completions/min_terminated_length": 720.0, + "epoch": 0.15434192672998642, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.302967563837129e-07, + "loss": 0.0, + "num_tokens": 15882739.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 455 + }, + { + "completion_length": 1633.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4204.0, + "completions/max_terminated_length": 4204.0, + "completions/mean_length": 1633.3333740234375, + "completions/mean_terminated_length": 1633.3333740234375, + "completions/min_length": 531.0, + "completions/min_terminated_length": 531.0, + "epoch": 0.1546811397557666, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5038626194000244, + "kl": 0.0, + "learning_rate": 4.301242236024845e-07, + "loss": 0.0028, + "num_tokens": 15915797.0, + "reward": 1.066666603088379, + "reward_std": 0.20655912160873413, + "rewards/correctness_reward_func/mean": 0.7666666507720947, + "rewards/correctness_reward_func/std": 0.3700941801071167, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 456 + }, + { + "completion_length": 2144.416748046875, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4475.0, + "completions/mean_length": 3242.58349609375, + "completions/mean_terminated_length": 2573.300048828125, + "completions/min_length": 863.0, + "completions/min_terminated_length": 863.0, + "epoch": 0.1550203527815468, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2132185697555542, + "kl": NaN, + "learning_rate": 4.2995169082125604e-07, + "loss": -0.0736, + "num_tokens": 15955018.0, + "reward": 0.9791667461395264, + "reward_std": 0.3328944146633148, + "rewards/correctness_reward_func/mean": 0.7166666984558105, + "rewards/correctness_reward_func/std": 0.3459725081920624, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.09323723614215851, + "step": 457 + }, + { + "completion_length": 2428.7501220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3740.0, + "completions/max_terminated_length": 3740.0, + "completions/mean_length": 2428.75, + "completions/mean_terminated_length": 2428.75, + "completions/min_length": 1041.0, + "completions/min_terminated_length": 1041.0, + "epoch": 0.155359565807327, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09832356125116348, + "kl": 0.0, + "learning_rate": 4.297791580400276e-07, + "loss": 0.0012, + "num_tokens": 15992965.0, + "reward": 1.1875, + "reward_std": 0.03061862848699093, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 458 + }, + { + "completion_length": 1184.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2296.0, + "completions/max_terminated_length": 2296.0, + "completions/mean_length": 1184.166748046875, + "completions/mean_terminated_length": 1184.166748046875, + "completions/min_length": 524.0, + "completions/min_terminated_length": 524.0, + "epoch": 0.1556987788331072, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07985512912273407, + "kl": 0.0, + "learning_rate": 4.2960662525879914e-07, + "loss": 0.0001, + "num_tokens": 16022223.0, + "reward": 1.25, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.949999988079071, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 459 + }, + { + "completion_length": 2334.7501220703125, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5195.0, + "completions/mean_length": 3432.916748046875, + "completions/mean_terminated_length": 2801.699951171875, + "completions/min_length": 1164.0, + "completions/min_terminated_length": 1164.0, + "epoch": 0.1560379918588874, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.36306536197662354, + "kl": NaN, + "learning_rate": 4.2943409247757075e-07, + "loss": -0.0326, + "num_tokens": 16063908.0, + "reward": 0.9166667461395264, + "reward_std": 0.2840188145637512, + "rewards/correctness_reward_func/mean": 0.6666666865348816, + "rewards/correctness_reward_func/std": 0.31139957904815674, + "rewards/format_reward_func/mean": 0.2500000298023224, + "rewards/format_reward_func/std": 0.11677484214305878, + "step": 460 + }, + { + "completion_length": 3764.7501220703125, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6420.0, + "completions/mean_length": 4313.83349609375, + "completions/mean_terminated_length": 4107.0, + "completions/min_length": 2831.0, + "completions/min_terminated_length": 2831.0, + "epoch": 0.15637720488466758, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2848891615867615, + "kl": NaN, + "learning_rate": 4.292615596963423e-07, + "loss": -0.0579, + "num_tokens": 16124367.0, + "reward": 0.6083333492279053, + "reward_std": 0.22453658282756805, + "rewards/correctness_reward_func/mean": 0.3333333432674408, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 461 + }, + { + "completion_length": 1265.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3036.0, + "completions/max_terminated_length": 3036.0, + "completions/mean_length": 1265.8333740234375, + "completions/mean_terminated_length": 1265.8333740234375, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "epoch": 0.15671641791044777, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.3097565770149231, + "kl": 0.0, + "learning_rate": 4.2908902691511386e-07, + "loss": 0.0004, + "num_tokens": 16151689.0, + "reward": 0.5541666746139526, + "reward_std": 0.19900795817375183, + "rewards/correctness_reward_func/mean": 0.2666666805744171, + "rewards/correctness_reward_func/std": 0.393892765045166, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 462 + }, + { + "completion_length": 2514.33349609375, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6211.0, + "completions/mean_length": 3612.5, + "completions/mean_terminated_length": 3017.199951171875, + "completions/min_length": 594.0, + "completions/min_terminated_length": 594.0, + "epoch": 0.15705563093622796, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7741003632545471, + "kl": NaN, + "learning_rate": 4.289164941338854e-07, + "loss": -0.045, + "num_tokens": 16194395.0, + "reward": 0.5, + "reward_std": 0.35132092237472534, + "rewards/correctness_reward_func/mean": 0.25, + "rewards/correctness_reward_func/std": 0.45226702094078064, + "rewards/format_reward_func/mean": 0.25, + "rewards/format_reward_func/std": 0.11677484214305878, + "step": 463 + }, + { + "completion_length": 1660.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2979.0, + "completions/max_terminated_length": 2979.0, + "completions/mean_length": 1660.916748046875, + "completions/mean_terminated_length": 1660.916748046875, + "completions/min_length": 679.0, + "completions/min_terminated_length": 679.0, + "epoch": 0.15739484396200815, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09117577224969864, + "kl": 0.0, + "learning_rate": 4.28743961352657e-07, + "loss": -0.0022, + "num_tokens": 16227238.0, + "reward": 1.2708332538604736, + "reward_std": 0.045871179550886154, + "rewards/correctness_reward_func/mean": 0.9833333492279053, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 464 + }, + { + "completion_length": 2607.5001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6129.0, + "completions/max_terminated_length": 6129.0, + "completions/mean_length": 2607.5, + "completions/mean_terminated_length": 2607.5, + "completions/min_length": 1522.0, + "completions/min_terminated_length": 1522.0, + "epoch": 0.15773405698778833, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.12152191996574402, + "kl": 0.0, + "learning_rate": 4.285714285714285e-07, + "loss": -0.0006, + "num_tokens": 16271164.0, + "reward": 1.1666667461395264, + "reward_std": 0.05163975805044174, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 465 + }, + { + "completion_length": 2674.166748046875, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5492.0, + "completions/mean_length": 3223.25, + "completions/mean_terminated_length": 2917.272705078125, + "completions/min_length": 643.0, + "completions/min_terminated_length": 643.0, + "epoch": 0.15807327001356852, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6836945414543152, + "kl": NaN, + "learning_rate": 4.283988957902001e-07, + "loss": -0.0139, + "num_tokens": 16311852.0, + "reward": 0.8583333492279053, + "reward_std": 0.28804606199264526, + "rewards/correctness_reward_func/mean": 0.5833333134651184, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660254627466202, + "step": 466 + }, + { + "completion_length": 616.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1650.0, + "completions/max_terminated_length": 1650.0, + "completions/mean_length": 616.0, + "completions/mean_terminated_length": 616.0, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "epoch": 0.1584124830393487, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.06278982013463974, + "kl": 0.0, + "learning_rate": 4.2822636300897167e-07, + "loss": 0.0002, + "num_tokens": 16331100.0, + "reward": 1.1166667938232422, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 467 + }, + { + "completion_length": 1923.1666870117188, + "completions/clipped_ratio": 0.41666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6093.0, + "completions/mean_length": 4668.58349609375, + "completions/mean_terminated_length": 3296.857177734375, + "completions/min_length": 932.0, + "completions/min_terminated_length": 932.0, + "epoch": 0.1587516960651289, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7384070158004761, + "kl": NaN, + "learning_rate": 4.280538302277433e-07, + "loss": 0.0206, + "num_tokens": 16367228.0, + "reward": 0.37916669249534607, + "reward_std": 0.3163924813270569, + "rewards/correctness_reward_func/mean": 0.21666665375232697, + "rewards/correctness_reward_func/std": 0.39504510164260864, + "rewards/format_reward_func/mean": 0.16250000894069672, + "rewards/format_reward_func/std": 0.14943073689937592, + "step": 468 + }, + { + "completion_length": 2877.7501220703125, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5040.0, + "completions/mean_length": 3975.916748046875, + "completions/mean_terminated_length": 3453.300048828125, + "completions/min_length": 1252.0, + "completions/min_terminated_length": 1252.0, + "epoch": 0.1590909090909091, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8427397012710571, + "kl": NaN, + "learning_rate": 4.278812974465148e-07, + "loss": -0.0179, + "num_tokens": 16411697.0, + "reward": 0.4624999761581421, + "reward_std": 0.35593757033348083, + "rewards/correctness_reward_func/mean": 0.25, + "rewards/correctness_reward_func/std": 0.45226702094078064, + "rewards/format_reward_func/mean": 0.21250002086162567, + "rewards/format_reward_func/std": 0.11894422769546509, + "step": 469 + }, + { + "completion_length": 971.0000305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1860.0, + "completions/max_terminated_length": 1860.0, + "completions/mean_length": 971.0, + "completions/mean_terminated_length": 971.0, + "completions/min_length": 584.0, + "completions/min_terminated_length": 584.0, + "epoch": 0.15943012211668928, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.582651520948275e-07, + "kl": 0.0, + "learning_rate": 4.277087646652864e-07, + "loss": 0.0, + "num_tokens": 16437101.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 470 + }, + { + "completion_length": 877.5833435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1520.0, + "completions/max_terminated_length": 1520.0, + "completions/mean_length": 877.5833740234375, + "completions/mean_terminated_length": 877.5833740234375, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.15976933514246947, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.317609041929245, + "kl": 0.0, + "learning_rate": 4.2753623188405794e-07, + "loss": -0.0005, + "num_tokens": 16457586.0, + "reward": 0.6791666746139526, + "reward_std": 0.22598153352737427, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.5149286985397339, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 471 + }, + { + "completion_length": 1876.166748046875, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 3251.0, + "completions/mean_length": 2425.25, + "completions/mean_terminated_length": 2046.727294921875, + "completions/min_length": 1071.0, + "completions/min_terminated_length": 1071.0, + "epoch": 0.16010854816824965, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.21075165271759033, + "kl": NaN, + "learning_rate": 4.2736369910282954e-07, + "loss": -0.0226, + "num_tokens": 16495232.0, + "reward": 1.0250000953674316, + "reward_std": 0.23611438274383545, + "rewards/correctness_reward_func/mean": 0.75, + "rewards/correctness_reward_func/std": 0.24308621883392334, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660254627466202, + "step": 472 + }, + { + "completion_length": 2999.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5298.0, + "completions/max_terminated_length": 5298.0, + "completions/mean_length": 2999.33349609375, + "completions/mean_terminated_length": 2999.33349609375, + "completions/min_length": 931.0, + "completions/min_terminated_length": 931.0, + "epoch": 0.16044776119402984, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6389744877815247, + "kl": 0.0, + "learning_rate": 4.271911663216011e-07, + "loss": -0.0009, + "num_tokens": 16545672.0, + "reward": 0.6333333253860474, + "reward_std": 0.1632993221282959, + "rewards/correctness_reward_func/mean": 0.3333333432674408, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 473 + }, + { + "completion_length": 1288.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3679.0, + "completions/max_terminated_length": 3679.0, + "completions/mean_length": 1288.75, + "completions/mean_terminated_length": 1288.75, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.16078697421981003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.43943166732788086, + "kl": 0.0, + "learning_rate": 4.2701863354037265e-07, + "loss": 0.0136, + "num_tokens": 16575927.0, + "reward": 0.6125000715255737, + "reward_std": 0.21714738011360168, + "rewards/correctness_reward_func/mean": 0.3500000238418579, + "rewards/correctness_reward_func/std": 0.4358898997306824, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 474 + }, + { + "completion_length": 1136.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2292.0, + "completions/max_terminated_length": 2292.0, + "completions/mean_length": 1136.25, + "completions/mean_terminated_length": 1136.25, + "completions/min_length": 605.0, + "completions/min_terminated_length": 605.0, + "epoch": 0.16112618724559022, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.04957255721092224, + "kl": 0.0, + "learning_rate": 4.2684610075914425e-07, + "loss": 0.0003, + "num_tokens": 16598586.0, + "reward": 0.6875001788139343, + "reward_std": 0.03061862289905548, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 475 + }, + { + "completion_length": 3438.916748046875, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5689.0, + "completions/mean_length": 5086.1669921875, + "completions/mean_terminated_length": 4585.22216796875, + "completions/min_length": 3284.0, + "completions/min_terminated_length": 3284.0, + "epoch": 0.1614654002713704, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.16727101802825928, + "kl": NaN, + "learning_rate": 4.2667356797791575e-07, + "loss": -0.0145, + "num_tokens": 16649081.0, + "reward": 0.1875, + "reward_std": 0.041079193353652954, + "rewards/correctness_reward_func/mean": 0.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.1875, + "rewards/format_reward_func/std": 0.12990382313728333, + "step": 476 + }, + { + "completion_length": 2295.2501220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5441.0, + "completions/max_terminated_length": 5441.0, + "completions/mean_length": 2295.25, + "completions/mean_terminated_length": 2295.25, + "completions/min_length": 550.0, + "completions/min_terminated_length": 550.0, + "epoch": 0.16180461329715062, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6933732032775879, + "kl": 0.0, + "learning_rate": 4.2650103519668736e-07, + "loss": 0.019, + "num_tokens": 16686392.0, + "reward": 1.0833334922790527, + "reward_std": 0.19407901167869568, + "rewards/correctness_reward_func/mean": 0.7833333015441895, + "rewards/correctness_reward_func/std": 0.26227444410324097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 477 + }, + { + "completion_length": 2071.3333740234375, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5041.0, + "completions/mean_length": 3718.58349609375, + "completions/mean_terminated_length": 2761.77783203125, + "completions/min_length": 963.0, + "completions/min_terminated_length": 963.0, + "epoch": 0.1621438263229308, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9527459740638733, + "kl": NaN, + "learning_rate": 4.263285024154589e-07, + "loss": -0.0759, + "num_tokens": 16724532.0, + "reward": 0.7916667461395264, + "reward_std": 0.31841057538986206, + "rewards/correctness_reward_func/mean": 0.5666666626930237, + "rewards/correctness_reward_func/std": 0.42497774958610535, + "rewards/format_reward_func/mean": 0.2250000238418579, + "rewards/format_reward_func/std": 0.11965861171483994, + "step": 478 + }, + { + "completion_length": 2586.0001220703125, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6358.0, + "completions/mean_length": 3135.08349609375, + "completions/mean_terminated_length": 2821.091064453125, + "completions/min_length": 727.0, + "completions/min_terminated_length": 727.0, + "epoch": 0.162483039348711, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6366803646087646, + "kl": NaN, + "learning_rate": 4.261559696342305e-07, + "loss": -0.0671, + "num_tokens": 16771362.0, + "reward": 1.191666603088379, + "reward_std": 0.26536136865615845, + "rewards/correctness_reward_func/mean": 0.9166666865348816, + "rewards/correctness_reward_func/std": 0.28867512941360474, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 479 + }, + { + "completion_length": 712.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1226.0, + "completions/max_terminated_length": 1226.0, + "completions/mean_length": 712.5, + "completions/mean_terminated_length": 712.5, + "completions/min_length": 472.0, + "completions/min_terminated_length": 472.0, + "epoch": 0.1628222523744912, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.25983436853002e-07, + "loss": 0.0, + "num_tokens": 16790934.0, + "reward": 1.2999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 1.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 480 + }, + { + "completion_length": 1154.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2626.0, + "completions/max_terminated_length": 2626.0, + "completions/mean_length": 1154.666748046875, + "completions/mean_terminated_length": 1154.666748046875, + "completions/min_length": 672.0, + "completions/min_terminated_length": 672.0, + "epoch": 0.16316146540027138, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.258109040717736e-07, + "loss": 0.0, + "num_tokens": 16819826.0, + "reward": 0.30000001192092896, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 481 + }, + { + "completion_length": 3743.750244140625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6112.0, + "completions/max_terminated_length": 6112.0, + "completions/mean_length": 3743.75, + "completions/mean_terminated_length": 3743.75, + "completions/min_length": 1972.0, + "completions/min_terminated_length": 1972.0, + "epoch": 0.16350067842605157, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.1540701545272896e-07, + "kl": 0.0, + "learning_rate": 4.256383712905452e-07, + "loss": 0.0, + "num_tokens": 16876589.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 482 + }, + { + "completion_length": 2762.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5686.0, + "completions/max_terminated_length": 5686.0, + "completions/mean_length": 2762.916748046875, + "completions/mean_terminated_length": 2762.916748046875, + "completions/min_length": 1178.0, + "completions/min_terminated_length": 1178.0, + "epoch": 0.16383989145183175, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.254658385093168e-07, + "loss": 0.0, + "num_tokens": 16923526.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 483 + }, + { + "completion_length": 2833.1666870117188, + "completions/clipped_ratio": 0.33333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6037.0, + "completions/mean_length": 5029.5, + "completions/mean_terminated_length": 4249.75, + "completions/min_length": 2548.0, + "completions/min_terminated_length": 2548.0, + "epoch": 0.16417910447761194, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6901265978813171, + "kl": NaN, + "learning_rate": 4.252933057280883e-07, + "loss": -0.0956, + "num_tokens": 16970076.0, + "reward": 0.7333334684371948, + "reward_std": 0.2840188145637512, + "rewards/correctness_reward_func/mean": 0.5333333611488342, + "rewards/correctness_reward_func/std": 0.3938928246498108, + "rewards/format_reward_func/mean": 0.20000000298023224, + "rewards/format_reward_func/std": 0.14770980179309845, + "step": 484 + }, + { + "completion_length": 1643.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2812.0, + "completions/max_terminated_length": 2812.0, + "completions/mean_length": 1643.666748046875, + "completions/mean_terminated_length": 1643.666748046875, + "completions/min_length": 969.0, + "completions/min_terminated_length": 969.0, + "epoch": 0.16451831750339213, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.251207729468599e-07, + "loss": 0.0, + "num_tokens": 17005076.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 485 + }, + { + "completion_length": 1820.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3135.0, + "completions/max_terminated_length": 3135.0, + "completions/mean_length": 1820.8333740234375, + "completions/mean_terminated_length": 1820.8333740234375, + "completions/min_length": 499.0, + "completions/min_terminated_length": 499.0, + "epoch": 0.16485753052917232, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6359934210777283, + "kl": 0.0, + "learning_rate": 4.2494824016563144e-07, + "loss": 0.0045, + "num_tokens": 17038380.0, + "reward": 1.004166841506958, + "reward_std": 0.28193777799606323, + "rewards/correctness_reward_func/mean": 0.7166666984558105, + "rewards/correctness_reward_func/std": 0.3459725081920624, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 486 + }, + { + "completion_length": 1532.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3784.0, + "completions/max_terminated_length": 3784.0, + "completions/mean_length": 1532.416748046875, + "completions/mean_terminated_length": 1532.416748046875, + "completions/min_length": 406.0, + "completions/min_terminated_length": 406.0, + "epoch": 0.1651967435549525, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5672784447669983, + "kl": 0.0, + "learning_rate": 4.24775707384403e-07, + "loss": 0.0126, + "num_tokens": 17068793.0, + "reward": 0.6499999761581421, + "reward_std": 0.17606817185878754, + "rewards/correctness_reward_func/mean": 0.3500000238418579, + "rewards/correctness_reward_func/std": 0.4358898997306824, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 487 + }, + { + "completion_length": 1676.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2966.0, + "completions/max_terminated_length": 2966.0, + "completions/mean_length": 1676.166748046875, + "completions/mean_terminated_length": 1676.166748046875, + "completions/min_length": 652.0, + "completions/min_terminated_length": 652.0, + "epoch": 0.1655359565807327, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6939499378204346, + "kl": 0.0, + "learning_rate": 4.246031746031746e-07, + "loss": -0.0032, + "num_tokens": 17098855.0, + "reward": 0.5708333253860474, + "reward_std": 0.43851161003112793, + "rewards/correctness_reward_func/mean": 0.28333333134651184, + "rewards/correctness_reward_func/std": 0.4217568039894104, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 488 + }, + { + "completion_length": 608.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 911.0, + "completions/max_terminated_length": 911.0, + "completions/mean_length": 608.5, + "completions/mean_terminated_length": 608.5, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "epoch": 0.16587516960651288, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.13784745648627e-08, + "kl": 0.0, + "learning_rate": 4.2443064182194615e-07, + "loss": 0.0, + "num_tokens": 17118703.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 489 + }, + { + "completion_length": 1851.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4297.0, + "completions/max_terminated_length": 4297.0, + "completions/mean_length": 1851.8333740234375, + "completions/mean_terminated_length": 1851.8333740234375, + "completions/min_length": 495.0, + "completions/min_terminated_length": 495.0, + "epoch": 0.16621438263229307, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16395634412765503, + "kl": 0.0, + "learning_rate": 4.2425810904071776e-07, + "loss": -0.0026, + "num_tokens": 17150327.0, + "reward": 1.1666667461395264, + "reward_std": 0.09559705853462219, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 490 + }, + { + "completion_length": 1038.9167175292969, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2055.0, + "completions/max_terminated_length": 2055.0, + "completions/mean_length": 1038.916748046875, + "completions/mean_terminated_length": 1038.916748046875, + "completions/min_length": 536.0, + "completions/min_terminated_length": 536.0, + "epoch": 0.16655359565807326, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10568089783191681, + "kl": 0.0, + "learning_rate": 4.2408557625948926e-07, + "loss": -0.0016, + "num_tokens": 17176492.0, + "reward": 0.7375000715255737, + "reward_std": 0.0853908583521843, + "rewards/correctness_reward_func/mean": 0.44999995827674866, + "rewards/correctness_reward_func/std": 0.47577688097953796, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 491 + }, + { + "completion_length": 2283.416748046875, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6078.0, + "completions/mean_length": 2832.5, + "completions/mean_terminated_length": 2491.0, + "completions/min_length": 654.0, + "completions/min_terminated_length": 654.0, + "epoch": 0.16689280868385345, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4687753915786743, + "kl": NaN, + "learning_rate": 4.2391304347826086e-07, + "loss": -0.0141, + "num_tokens": 17219097.0, + "reward": 0.8083333969116211, + "reward_std": 0.23327383399009705, + "rewards/correctness_reward_func/mean": 0.5333333611488342, + "rewards/correctness_reward_func/std": 0.3938928246498108, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 492 + }, + { + "completion_length": 2379.666748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5512.0, + "completions/max_terminated_length": 5512.0, + "completions/mean_length": 2379.666748046875, + "completions/mean_terminated_length": 2379.666748046875, + "completions/min_length": 583.0, + "completions/min_terminated_length": 583.0, + "epoch": 0.16723202170963364, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5325803756713867, + "kl": 0.0, + "learning_rate": 4.237405106970324e-07, + "loss": 0.0065, + "num_tokens": 17260247.0, + "reward": 0.9500000476837158, + "reward_std": 0.27386125922203064, + "rewards/correctness_reward_func/mean": 0.6499999761581421, + "rewards/correctness_reward_func/std": 0.40113475918769836, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 493 + }, + { + "completion_length": 2168.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3947.0, + "completions/max_terminated_length": 3947.0, + "completions/mean_length": 2168.666748046875, + "completions/mean_terminated_length": 2168.666748046875, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "epoch": 0.16757123473541383, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.7554351927628886e-07, + "kl": 0.0, + "learning_rate": 4.23567977915804e-07, + "loss": 0.0, + "num_tokens": 17298409.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 494 + }, + { + "completion_length": 1945.0001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4646.0, + "completions/max_terminated_length": 4646.0, + "completions/mean_length": 1945.0, + "completions/mean_terminated_length": 1945.0, + "completions/min_length": 657.0, + "completions/min_terminated_length": 657.0, + "epoch": 0.16791044776119404, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.706338761981897e-07, + "kl": 0.0, + "learning_rate": 4.233954451345755e-07, + "loss": 0.0, + "num_tokens": 17336221.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 495 + }, + { + "completion_length": 1856.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4646.0, + "completions/max_terminated_length": 4646.0, + "completions/mean_length": 1856.25, + "completions/mean_terminated_length": 1856.25, + "completions/min_length": 642.0, + "completions/min_terminated_length": 642.0, + "epoch": 0.16824966078697423, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0943993553519249, + "kl": 0.0, + "learning_rate": 4.2322291235334713e-07, + "loss": -0.0045, + "num_tokens": 17374858.0, + "reward": 1.2666666507720947, + "reward_std": 0.08164961636066437, + "rewards/correctness_reward_func/mean": 0.9666666984558105, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 496 + }, + { + "completion_length": 1558.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3671.0, + "completions/max_terminated_length": 3671.0, + "completions/mean_length": 1558.75, + "completions/mean_terminated_length": 1558.75, + "completions/min_length": 586.0, + "completions/min_terminated_length": 586.0, + "epoch": 0.16858887381275442, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.6221547127770464e-07, + "kl": 0.0, + "learning_rate": 4.230503795721187e-07, + "loss": 0.0, + "num_tokens": 17402695.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 497 + }, + { + "completion_length": 3426.666748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6069.0, + "completions/max_terminated_length": 6069.0, + "completions/mean_length": 3426.666748046875, + "completions/mean_terminated_length": 3426.666748046875, + "completions/min_length": 1866.0, + "completions/min_terminated_length": 1866.0, + "epoch": 0.1689280868385346, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6243091821670532, + "kl": 0.0, + "learning_rate": 4.2287784679089023e-07, + "loss": 0.0319, + "num_tokens": 17453313.0, + "reward": 0.4541667103767395, + "reward_std": 0.28881752490997314, + "rewards/correctness_reward_func/mean": 0.1666666716337204, + "rewards/correctness_reward_func/std": 0.3892494738101959, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 498 + }, + { + "completion_length": 1164.6667175292969, + "completions/clipped_ratio": 0.5, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 3076.0, + "completions/mean_length": 4459.1669921875, + "completions/mean_terminated_length": 2329.33349609375, + "completions/min_length": 1407.0, + "completions/min_terminated_length": 1407.0, + "epoch": 0.1692672998643148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": NaN, + "learning_rate": 4.227053140096618e-07, + "loss": 0.0, + "num_tokens": 17482115.0, + "reward": 0.15000000596046448, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.15000000596046448, + "rewards/format_reward_func/std": 0.15666989982128143, + "step": 499 + }, + { + "completion_length": 1079.5000305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2631.0, + "completions/max_terminated_length": 2631.0, + "completions/mean_length": 1079.5, + "completions/mean_terminated_length": 1079.5, + "completions/min_length": 621.0, + "completions/min_terminated_length": 621.0, + "epoch": 0.16960651289009498, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.258025102099054e-08, + "kl": 0.0, + "learning_rate": 4.225327812284334e-07, + "loss": 0.0, + "num_tokens": 17506619.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 500 + }, + { + "completion_length": 1423.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2845.0, + "completions/max_terminated_length": 2845.0, + "completions/mean_length": 1423.0833740234375, + "completions/mean_terminated_length": 1423.0833740234375, + "completions/min_length": 978.0, + "completions/min_terminated_length": 978.0, + "epoch": 0.16994572591587517, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.1390148103237152, + "kl": 0.0, + "learning_rate": 4.2236024844720495e-07, + "loss": 0.0015, + "num_tokens": 17536302.0, + "reward": 1.133333444595337, + "reward_std": 0.051639772951602936, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 501 + }, + { + "completion_length": 1925.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3312.0, + "completions/max_terminated_length": 3312.0, + "completions/mean_length": 1925.916748046875, + "completions/mean_terminated_length": 1925.916748046875, + "completions/min_length": 1054.0, + "completions/min_terminated_length": 1054.0, + "epoch": 0.17028493894165536, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.353748923018429e-07, + "kl": 0.0, + "learning_rate": 4.221877156659765e-07, + "loss": 0.0, + "num_tokens": 17568329.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 502 + }, + { + "completion_length": 2333.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4467.0, + "completions/max_terminated_length": 4467.0, + "completions/mean_length": 2333.58349609375, + "completions/mean_terminated_length": 2333.58349609375, + "completions/min_length": 696.0, + "completions/min_terminated_length": 696.0, + "epoch": 0.17062415196743555, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.10262121260166168, + "kl": 0.0, + "learning_rate": 4.220151828847481e-07, + "loss": -0.0002, + "num_tokens": 17607348.0, + "reward": 1.1500000953674316, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.8500000834465027, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 503 + }, + { + "completion_length": 2920.8334350585938, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6092.0, + "completions/mean_length": 3469.916748046875, + "completions/mean_terminated_length": 3186.36376953125, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "epoch": 0.17096336499321574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6888859868049622, + "kl": NaN, + "learning_rate": 4.2184265010351966e-07, + "loss": -0.0503, + "num_tokens": 17652208.0, + "reward": 1.0833333730697632, + "reward_std": 0.22770795226097107, + "rewards/correctness_reward_func/mean": 0.7833333015441895, + "rewards/correctness_reward_func/std": 0.26227444410324097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 504 + }, + { + "completion_length": 2582.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4693.0, + "completions/max_terminated_length": 4693.0, + "completions/mean_length": 2582.166748046875, + "completions/mean_terminated_length": 2582.166748046875, + "completions/min_length": 887.0, + "completions/min_terminated_length": 887.0, + "epoch": 0.17130257801899593, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.13936969637870789, + "kl": 0.0, + "learning_rate": 4.2167011732229126e-07, + "loss": -0.0029, + "num_tokens": 17696994.0, + "reward": 1.25, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.949999988079071, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 505 + }, + { + "completion_length": 1910.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4204.0, + "completions/max_terminated_length": 4204.0, + "completions/mean_length": 1910.166748046875, + "completions/mean_terminated_length": 1910.166748046875, + "completions/min_length": 529.0, + "completions/min_terminated_length": 529.0, + "epoch": 0.17164179104477612, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7006736397743225, + "kl": 0.0, + "learning_rate": 4.2149758454106276e-07, + "loss": 0.0117, + "num_tokens": 17730788.0, + "reward": 0.9500000476837158, + "reward_std": 0.27386125922203064, + "rewards/correctness_reward_func/mean": 0.6499999761581421, + "rewards/correctness_reward_func/std": 0.40113475918769836, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 506 + }, + { + "completion_length": 1539.416748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3405.0, + "completions/max_terminated_length": 3405.0, + "completions/mean_length": 1539.416748046875, + "completions/mean_terminated_length": 1539.416748046875, + "completions/min_length": 449.0, + "completions/min_terminated_length": 449.0, + "epoch": 0.1719810040705563, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.11347401887178421, + "kl": 0.0, + "learning_rate": 4.2132505175983437e-07, + "loss": -0.0027, + "num_tokens": 17758231.0, + "reward": 1.133333444595337, + "reward_std": 0.051639772951602936, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 507 + }, + { + "completion_length": 1861.0834350585938, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4365.0, + "completions/max_terminated_length": 4365.0, + "completions/mean_length": 1861.0833740234375, + "completions/mean_terminated_length": 1861.0833740234375, + "completions/min_length": 472.0, + "completions/min_terminated_length": 472.0, + "epoch": 0.1723202170963365, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.16115796566009521, + "kl": 0.0, + "learning_rate": 4.211525189786059e-07, + "loss": 0.0001, + "num_tokens": 17789888.0, + "reward": 1.1500000953674316, + "reward_std": 0.05477222427725792, + "rewards/correctness_reward_func/mean": 0.8499999642372131, + "rewards/correctness_reward_func/std": 0.09045340120792389, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 508 + }, + { + "completion_length": 1629.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3168.0, + "completions/max_terminated_length": 3168.0, + "completions/mean_length": 1629.8333740234375, + "completions/mean_terminated_length": 1629.8333740234375, + "completions/min_length": 874.0, + "completions/min_terminated_length": 874.0, + "epoch": 0.17265943012211668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.209799861973775e-07, + "loss": 0.0, + "num_tokens": 17820948.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 509 + }, + { + "completion_length": 1754.2501220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3233.0, + "completions/max_terminated_length": 3233.0, + "completions/mean_length": 1754.25, + "completions/mean_terminated_length": 1754.25, + "completions/min_length": 771.0, + "completions/min_terminated_length": 771.0, + "epoch": 0.17299864314789687, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.33984920382499695, + "kl": 0.0, + "learning_rate": 4.2080745341614903e-07, + "loss": 0.01, + "num_tokens": 17851845.0, + "reward": 0.36666667461395264, + "reward_std": 0.1632993221282959, + "rewards/correctness_reward_func/mean": 0.06666667014360428, + "rewards/correctness_reward_func/std": 0.23094011843204498, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 510 + }, + { + "completion_length": 1409.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4162.0, + "completions/max_terminated_length": 4162.0, + "completions/mean_length": 1409.3333740234375, + "completions/mean_terminated_length": 1409.3333740234375, + "completions/min_length": 551.0, + "completions/min_terminated_length": 551.0, + "epoch": 0.17333785617367706, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.3745829463005066, + "kl": 0.0, + "learning_rate": 4.2063492063492063e-07, + "loss": 0.0158, + "num_tokens": 17880055.0, + "reward": 0.833333432674408, + "reward_std": 0.20655912160873413, + "rewards/correctness_reward_func/mean": 0.5333333611488342, + "rewards/correctness_reward_func/std": 0.3938928246498108, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 511 + }, + { + "completion_length": 1085.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1805.0, + "completions/max_terminated_length": 1805.0, + "completions/mean_length": 1085.916748046875, + "completions/mean_terminated_length": 1085.916748046875, + "completions/min_length": 630.0, + "completions/min_terminated_length": 630.0, + "epoch": 0.17367706919945725, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.05689301714301109, + "kl": 0.0, + "learning_rate": 4.204623878536922e-07, + "loss": -0.0001, + "num_tokens": 17904756.0, + "reward": 1.0875000953674316, + "reward_std": 0.030618613585829735, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 512 + }, + { + "completion_length": 2082.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3168.0, + "completions/max_terminated_length": 3168.0, + "completions/mean_length": 2082.83349609375, + "completions/mean_terminated_length": 2082.83349609375, + "completions/min_length": 1306.0, + "completions/min_terminated_length": 1306.0, + "epoch": 0.17401628222523746, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.502258837223053, + "kl": 0.0, + "learning_rate": 4.2028985507246374e-07, + "loss": 0.0137, + "num_tokens": 17941312.0, + "reward": 0.5666667222976685, + "reward_std": 0.20655910670757294, + "rewards/correctness_reward_func/mean": 0.2666666805744171, + "rewards/correctness_reward_func/std": 0.393892765045166, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 513 + }, + { + "completion_length": 1895.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3063.0, + "completions/max_terminated_length": 3063.0, + "completions/mean_length": 1895.916748046875, + "completions/mean_terminated_length": 1895.916748046875, + "completions/min_length": 1216.0, + "completions/min_terminated_length": 1216.0, + "epoch": 0.17435549525101765, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.2023741646771668e-07, + "kl": 0.0, + "learning_rate": 4.201173222912353e-07, + "loss": 0.0, + "num_tokens": 17976897.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 514 + }, + { + "completion_length": 3192.2501220703125, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6415.0, + "completions/mean_length": 4290.4169921875, + "completions/mean_terminated_length": 3830.699951171875, + "completions/min_length": 1845.0, + "completions/min_terminated_length": 1845.0, + "epoch": 0.17469470827679784, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8532640933990479, + "kl": NaN, + "learning_rate": 4.199447895100069e-07, + "loss": -0.0693, + "num_tokens": 18027606.0, + "reward": 0.9666666984558105, + "reward_std": 0.3814123868942261, + "rewards/correctness_reward_func/mean": 0.7166666984558105, + "rewards/correctness_reward_func/std": 0.4386618733406067, + "rewards/format_reward_func/mean": 0.25, + "rewards/format_reward_func/std": 0.09770084172487259, + "step": 515 + }, + { + "completion_length": 2581.5001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6207.0, + "completions/max_terminated_length": 6207.0, + "completions/mean_length": 2581.5, + "completions/mean_terminated_length": 2581.5, + "completions/min_length": 869.0, + "completions/min_terminated_length": 869.0, + "epoch": 0.17503392130257803, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8625640869140625, + "kl": 0.0, + "learning_rate": 4.1977225672877845e-07, + "loss": 0.0173, + "num_tokens": 18071898.0, + "reward": 0.8166667222976685, + "reward_std": 0.46741676330566406, + "rewards/correctness_reward_func/mean": 0.5166666507720947, + "rewards/correctness_reward_func/std": 0.4628632962703705, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 516 + }, + { + "completion_length": 1080.3333587646484, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3990.0, + "completions/max_terminated_length": 3990.0, + "completions/mean_length": 1080.3333740234375, + "completions/mean_terminated_length": 1080.3333740234375, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.17537313432835822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.1959972394755e-07, + "loss": 0.0, + "num_tokens": 18096424.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 517 + }, + { + "completion_length": 830.0833587646484, + "completions/clipped_ratio": 0.5, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 2506.0, + "completions/mean_length": 4124.58349609375, + "completions/mean_terminated_length": 1660.166748046875, + "completions/min_length": 945.0, + "completions/min_terminated_length": 945.0, + "epoch": 0.1757123473541384, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": NaN, + "learning_rate": 4.1942719116632156e-07, + "loss": 0.0, + "num_tokens": 18123209.0, + "reward": 0.15000000596046448, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.15000000596046448, + "rewards/format_reward_func/std": 0.15666989982128143, + "step": 518 + }, + { + "completion_length": 2415.0001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4186.0, + "completions/max_terminated_length": 4186.0, + "completions/mean_length": 2415.0, + "completions/mean_terminated_length": 2415.0, + "completions/min_length": 884.0, + "completions/min_terminated_length": 884.0, + "epoch": 0.1760515603799186, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.772355489038091e-07, + "kl": 0.0, + "learning_rate": 4.1925465838509316e-07, + "loss": 0.0, + "num_tokens": 18164189.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 519 + }, + { + "completion_length": 862.75, + "completions/clipped_ratio": 0.33333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 2617.0, + "completions/mean_length": 3059.08349609375, + "completions/mean_terminated_length": 1294.125, + "completions/min_length": 738.0, + "completions/min_terminated_length": 738.0, + "epoch": 0.17639077340569878, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4193612337112427, + "kl": NaN, + "learning_rate": 4.190821256038647e-07, + "loss": -0.0477, + "num_tokens": 18186080.0, + "reward": 0.7666667699813843, + "reward_std": 0.3356585204601288, + "rewards/correctness_reward_func/mean": 0.5666666626930237, + "rewards/correctness_reward_func/std": 0.42497774958610535, + "rewards/format_reward_func/mean": 0.20000000298023224, + "rewards/format_reward_func/std": 0.14770980179309845, + "step": 520 + }, + { + "completion_length": 2900.5001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4391.0, + "completions/max_terminated_length": 4391.0, + "completions/mean_length": 2900.5, + "completions/mean_terminated_length": 2900.5, + "completions/min_length": 938.0, + "completions/min_terminated_length": 938.0, + "epoch": 0.17672998643147897, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.6848953161315876e-07, + "kl": 0.0, + "learning_rate": 4.1890959282263627e-07, + "loss": 0.0, + "num_tokens": 18233546.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 521 + }, + { + "completion_length": 3361.8333740234375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5968.0, + "completions/mean_length": 3910.916748046875, + "completions/mean_terminated_length": 3667.45458984375, + "completions/min_length": 949.0, + "completions/min_terminated_length": 949.0, + "epoch": 0.17706919945725916, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.06511241942644119, + "kl": NaN, + "learning_rate": 4.1873706004140787e-07, + "loss": -0.0128, + "num_tokens": 18285468.0, + "reward": 0.6750000715255737, + "reward_std": 0.06123724579811096, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 522 + }, + { + "completion_length": 1127.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3259.0, + "completions/max_terminated_length": 3259.0, + "completions/mean_length": 1127.75, + "completions/mean_terminated_length": 1127.75, + "completions/min_length": 552.0, + "completions/min_terminated_length": 552.0, + "epoch": 0.17740841248303935, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09431690722703934, + "kl": 0.0, + "learning_rate": 4.185645272601794e-07, + "loss": 0.0045, + "num_tokens": 18310161.0, + "reward": 1.1666667461395264, + "reward_std": 0.05163975805044174, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 523 + }, + { + "completion_length": 2173.0834350585938, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5945.0, + "completions/mean_length": 2722.166748046875, + "completions/mean_terminated_length": 2370.636474609375, + "completions/min_length": 905.0, + "completions/min_terminated_length": 905.0, + "epoch": 0.17774762550881953, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13693396747112274, + "kl": NaN, + "learning_rate": 4.18391994478951e-07, + "loss": -0.0113, + "num_tokens": 18345670.0, + "reward": 0.7250000834465027, + "reward_std": 0.11600948870182037, + "rewards/correctness_reward_func/mean": 0.44999995827674866, + "rewards/correctness_reward_func/std": 0.47577688097953796, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 524 + }, + { + "completion_length": 1425.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3034.0, + "completions/max_terminated_length": 3034.0, + "completions/mean_length": 1425.166748046875, + "completions/mean_terminated_length": 1425.166748046875, + "completions/min_length": 568.0, + "completions/min_terminated_length": 568.0, + "epoch": 0.17808683853459972, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12271206825971603, + "kl": 0.0, + "learning_rate": 4.1821946169772253e-07, + "loss": 0.0042, + "num_tokens": 18373776.0, + "reward": 1.2333333492279053, + "reward_std": 0.0955970510840416, + "rewards/correctness_reward_func/mean": 0.9333333969116211, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 525 + }, + { + "completion_length": 3609.0001220703125, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6538.0, + "completions/mean_length": 4158.08349609375, + "completions/mean_terminated_length": 3937.091064453125, + "completions/min_length": 863.0, + "completions/min_terminated_length": 863.0, + "epoch": 0.1784260515603799, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.16100013256073, + "kl": NaN, + "learning_rate": 4.1804692891649414e-07, + "loss": -0.0366, + "num_tokens": 18428916.0, + "reward": 0.3583333194255829, + "reward_std": 0.22453653812408447, + "rewards/correctness_reward_func/mean": 0.0833333358168602, + "rewards/correctness_reward_func/std": 0.28867512941360474, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 526 + }, + { + "completion_length": 2267.666748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4891.0, + "completions/max_terminated_length": 4891.0, + "completions/mean_length": 2267.666748046875, + "completions/mean_terminated_length": 2267.666748046875, + "completions/min_length": 1136.0, + "completions/min_terminated_length": 1136.0, + "epoch": 0.1787652645861601, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5539312958717346, + "kl": 0.0, + "learning_rate": 4.178743961352657e-07, + "loss": 0.0149, + "num_tokens": 18470840.0, + "reward": 1.0500000715255737, + "reward_std": 0.17606817185878754, + "rewards/correctness_reward_func/mean": 0.75, + "rewards/correctness_reward_func/std": 0.24308621883392334, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 527 + }, + { + "completion_length": 666.2500305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1261.0, + "completions/max_terminated_length": 1261.0, + "completions/mean_length": 666.25, + "completions/mean_terminated_length": 666.25, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.1791044776119403, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.1770186335403724e-07, + "loss": 0.0, + "num_tokens": 18494627.0, + "reward": 1.2999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 1.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 528 + }, + { + "completion_length": 1161.5000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2421.0, + "completions/max_terminated_length": 2421.0, + "completions/mean_length": 1161.5, + "completions/mean_terminated_length": 1161.5, + "completions/min_length": 471.0, + "completions/min_terminated_length": 471.0, + "epoch": 0.17944369063772048, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.6599263119587704e-07, + "kl": 0.0, + "learning_rate": 4.175293305728088e-07, + "loss": 0.0, + "num_tokens": 18518861.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 529 + }, + { + "completion_length": 921.0000305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1732.0, + "completions/max_terminated_length": 1732.0, + "completions/mean_length": 921.0, + "completions/mean_terminated_length": 921.0, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "epoch": 0.17978290366350066, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.20871922373771667, + "kl": 0.0, + "learning_rate": 4.173567977915804e-07, + "loss": -0.0013, + "num_tokens": 18541703.0, + "reward": 0.3541666865348816, + "reward_std": 0.17205862700939178, + "rewards/correctness_reward_func/mean": 0.06666667014360428, + "rewards/correctness_reward_func/std": 0.23094011843204498, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 530 + }, + { + "completion_length": 1214.7500305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4228.0, + "completions/max_terminated_length": 4228.0, + "completions/mean_length": 1214.75, + "completions/mean_terminated_length": 1214.75, + "completions/min_length": 581.0, + "completions/min_terminated_length": 581.0, + "epoch": 0.18012211668928088, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.06504378467798233, + "kl": 0.0, + "learning_rate": 4.171842650103519e-07, + "loss": -0.0003, + "num_tokens": 18563744.0, + "reward": 0.7749999761581421, + "reward_std": 0.03872983902692795, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 531 + }, + { + "completion_length": 1132.5000305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1858.0, + "completions/max_terminated_length": 1858.0, + "completions/mean_length": 1132.5, + "completions/mean_terminated_length": 1132.5, + "completions/min_length": 598.0, + "completions/min_terminated_length": 598.0, + "epoch": 0.18046132971506107, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.329216783891752e-07, + "kl": 0.0, + "learning_rate": 4.170117322291235e-07, + "loss": 0.0, + "num_tokens": 18589352.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 532 + }, + { + "completion_length": 701.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 918.0, + "completions/max_terminated_length": 918.0, + "completions/mean_length": 701.6666870117188, + "completions/mean_terminated_length": 701.6666870117188, + "completions/min_length": 510.0, + "completions/min_terminated_length": 510.0, + "epoch": 0.18080054274084126, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.12558390200138092, + "kl": 0.0, + "learning_rate": 4.1683919944789506e-07, + "loss": -0.0, + "num_tokens": 18610486.0, + "reward": 0.7166666984558105, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 533 + }, + { + "completion_length": 1081.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1984.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 1081.666748046875, + "completions/mean_terminated_length": 1081.666748046875, + "completions/min_length": 496.0, + "completions/min_terminated_length": 496.0, + "epoch": 0.18113975576662145, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09250768274068832, + "kl": 0.0, + "learning_rate": 4.1666666666666667e-07, + "loss": -0.0001, + "num_tokens": 18636492.0, + "reward": 0.75, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.44999995827674866, + "rewards/correctness_reward_func/std": 0.47577688097953796, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 534 + }, + { + "completion_length": 2280.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4509.0, + "completions/max_terminated_length": 4509.0, + "completions/mean_length": 2280.83349609375, + "completions/mean_terminated_length": 2280.83349609375, + "completions/min_length": 819.0, + "completions/min_terminated_length": 819.0, + "epoch": 0.18147896879240163, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.164941338854382e-07, + "loss": 0.0, + "num_tokens": 18671170.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 535 + }, + { + "completion_length": 1914.8333740234375, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4516.0, + "completions/mean_length": 3013.0, + "completions/mean_terminated_length": 2297.800048828125, + "completions/min_length": 508.0, + "completions/min_terminated_length": 508.0, + "epoch": 0.18181818181818182, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.438634991645813, + "kl": NaN, + "learning_rate": 4.1632160110420977e-07, + "loss": -0.028, + "num_tokens": 18707870.0, + "reward": 1.0499999523162842, + "reward_std": 0.3872982859611511, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.38138505816459656, + "rewards/format_reward_func/mean": 0.25, + "rewards/format_reward_func/std": 0.11677484214305878, + "step": 536 + }, + { + "completion_length": 2529.666748046875, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5630.0, + "completions/mean_length": 4176.9169921875, + "completions/mean_terminated_length": 3372.888916015625, + "completions/min_length": 2035.0, + "completions/min_terminated_length": 2035.0, + "epoch": 0.182157394843962, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.38183891773223877, + "kl": NaN, + "learning_rate": 4.161490683229814e-07, + "loss": -0.0232, + "num_tokens": 18749434.0, + "reward": 0.6375000476837158, + "reward_std": 0.07373940199613571, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.23750001192092896, + "rewards/format_reward_func/std": 0.11894422769546509, + "step": 537 + }, + { + "completion_length": 2225.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4071.0, + "completions/max_terminated_length": 4071.0, + "completions/mean_length": 2225.33349609375, + "completions/mean_terminated_length": 2225.33349609375, + "completions/min_length": 822.0, + "completions/min_terminated_length": 822.0, + "epoch": 0.1824966078697422, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.5497386129936785e-07, + "kl": 0.0, + "learning_rate": 4.1597653554175293e-07, + "loss": 0.0, + "num_tokens": 18788006.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 538 + }, + { + "completion_length": 1238.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3105.0, + "completions/max_terminated_length": 3105.0, + "completions/mean_length": 1238.0833740234375, + "completions/mean_terminated_length": 1238.0833740234375, + "completions/min_length": 560.0, + "completions/min_terminated_length": 560.0, + "epoch": 0.1828358208955224, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.6287909804523224e-07, + "kl": 0.0, + "learning_rate": 4.158040027605245e-07, + "loss": 0.0, + "num_tokens": 18813957.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 539 + }, + { + "completion_length": 1780.666748046875, + "completions/clipped_ratio": 0.33333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6236.0, + "completions/mean_length": 3977.0, + "completions/mean_terminated_length": 2671.0, + "completions/min_length": 490.0, + "completions/min_terminated_length": 490.0, + "epoch": 0.18317503392130258, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5980116128921509, + "kl": NaN, + "learning_rate": 4.1563146997929604e-07, + "loss": -0.0649, + "num_tokens": 18847355.0, + "reward": 0.7333333492279053, + "reward_std": 0.27224498987197876, + "rewards/correctness_reward_func/mean": 0.5333333015441895, + "rewards/correctness_reward_func/std": 0.47736650705337524, + "rewards/format_reward_func/mean": 0.20000000298023224, + "rewards/format_reward_func/std": 0.14770980179309845, + "step": 540 + }, + { + "completion_length": 2010.916748046875, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5844.0, + "completions/mean_length": 3658.166748046875, + "completions/mean_terminated_length": 2681.22216796875, + "completions/min_length": 1292.0, + "completions/min_terminated_length": 1292.0, + "epoch": 0.18351424694708277, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.284682750701904, + "kl": NaN, + "learning_rate": 4.1545893719806764e-07, + "loss": -0.0252, + "num_tokens": 18885052.0, + "reward": 0.6750000715255737, + "reward_std": 0.13693061470985413, + "rewards/correctness_reward_func/mean": 0.44999995827674866, + "rewards/correctness_reward_func/std": 0.47577688097953796, + "rewards/format_reward_func/mean": 0.22500000894069672, + "rewards/format_reward_func/std": 0.13568010926246643, + "step": 541 + }, + { + "completion_length": 2788.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5735.0, + "completions/max_terminated_length": 5735.0, + "completions/mean_length": 2788.08349609375, + "completions/mean_terminated_length": 2788.08349609375, + "completions/min_length": 794.0, + "completions/min_terminated_length": 794.0, + "epoch": 0.18385345997286295, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6677839756011963, + "kl": 0.0, + "learning_rate": 4.1528640441683914e-07, + "loss": -0.0025, + "num_tokens": 18932045.0, + "reward": 1.066666841506958, + "reward_std": 0.2168930023908615, + "rewards/correctness_reward_func/mean": 0.7666667103767395, + "rewards/correctness_reward_func/std": 0.25346091389656067, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 542 + }, + { + "completion_length": 1617.0001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4218.0, + "completions/max_terminated_length": 4218.0, + "completions/mean_length": 1617.0, + "completions/mean_terminated_length": 1617.0, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "epoch": 0.18419267299864314, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.618871388283878e-08, + "kl": 0.0, + "learning_rate": 4.1511387163561075e-07, + "loss": 0.0, + "num_tokens": 18965921.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 543 + }, + { + "completion_length": 1418.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3041.0, + "completions/max_terminated_length": 3041.0, + "completions/mean_length": 1418.666748046875, + "completions/mean_terminated_length": 1418.666748046875, + "completions/min_length": 660.0, + "completions/min_terminated_length": 660.0, + "epoch": 0.18453188602442333, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2889324724674225, + "kl": 0.0, + "learning_rate": 4.149413388543823e-07, + "loss": 0.0026, + "num_tokens": 18996343.0, + "reward": 0.833333432674408, + "reward_std": 0.20655912160873413, + "rewards/correctness_reward_func/mean": 0.5333333611488342, + "rewards/correctness_reward_func/std": 0.3938928246498108, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 544 + }, + { + "completion_length": 1246.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2340.0, + "completions/max_terminated_length": 2340.0, + "completions/mean_length": 1246.3333740234375, + "completions/mean_terminated_length": 1246.3333740234375, + "completions/min_length": 633.0, + "completions/min_terminated_length": 633.0, + "epoch": 0.18487109905020352, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.43304693698883057, + "kl": 0.0, + "learning_rate": 4.147688060731539e-07, + "loss": 0.0088, + "num_tokens": 19020623.0, + "reward": 1.1166667938232422, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func/mean": 0.8166666030883789, + "rewards/correctness_reward_func/std": 0.27579087018966675, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 545 + }, + { + "completion_length": 1579.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4176.0, + "completions/max_terminated_length": 4176.0, + "completions/mean_length": 1579.0, + "completions/mean_terminated_length": 1579.0, + "completions/min_length": 531.0, + "completions/min_terminated_length": 531.0, + "epoch": 0.1852103120759837, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.227987891956218e-07, + "kl": 0.0, + "learning_rate": 4.145962732919254e-07, + "loss": 0.0, + "num_tokens": 19051523.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 546 + }, + { + "completion_length": 2309.0001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6122.0, + "completions/max_terminated_length": 6122.0, + "completions/mean_length": 2309.0, + "completions/mean_terminated_length": 2309.0, + "completions/min_length": 819.0, + "completions/min_terminated_length": 819.0, + "epoch": 0.1855495251017639, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.14423740510697e-07, + "loss": 0.0, + "num_tokens": 19088027.0, + "reward": 1.2999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 1.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 547 + }, + { + "completion_length": 1715.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3856.0, + "completions/max_terminated_length": 3856.0, + "completions/mean_length": 1715.0, + "completions/mean_terminated_length": 1715.0, + "completions/min_length": 691.0, + "completions/min_terminated_length": 691.0, + "epoch": 0.18588873812754408, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35062411427497864, + "kl": 0.0, + "learning_rate": 4.1425120772946856e-07, + "loss": -0.0015, + "num_tokens": 19123619.0, + "reward": 0.8833333849906921, + "reward_std": 0.26133137941360474, + "rewards/correctness_reward_func/mean": 0.5833333134651184, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 548 + }, + { + "completion_length": 1286.8333740234375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 3200.0, + "completions/mean_length": 1835.916748046875, + "completions/mean_terminated_length": 1403.8182373046875, + "completions/min_length": 492.0, + "completions/min_terminated_length": 492.0, + "epoch": 0.1862279511533243, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7429186701774597, + "kl": NaN, + "learning_rate": 4.1407867494824017e-07, + "loss": -0.0282, + "num_tokens": 19155267.0, + "reward": 0.8083333373069763, + "reward_std": 0.5039968490600586, + "rewards/correctness_reward_func/mean": 0.5333333611488342, + "rewards/correctness_reward_func/std": 0.47736650705337524, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 549 + }, + { + "completion_length": 2776.08349609375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6261.0, + "completions/mean_length": 3325.166748046875, + "completions/mean_terminated_length": 3028.45458984375, + "completions/min_length": 966.0, + "completions/min_terminated_length": 966.0, + "epoch": 0.1865671641791045, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.08030923455953598, + "kl": NaN, + "learning_rate": 4.1390614216701167e-07, + "loss": -0.0084, + "num_tokens": 19195720.0, + "reward": 0.6875001788139343, + "reward_std": 0.03061862289905548, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 550 + }, + { + "completion_length": 2092.5833740234375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5555.0, + "completions/mean_length": 2641.666748046875, + "completions/mean_terminated_length": 2282.818359375, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "epoch": 0.18690637720488468, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.49874621629714966, + "kl": NaN, + "learning_rate": 4.137336093857833e-07, + "loss": -0.0328, + "num_tokens": 19236557.0, + "reward": 1.1000001430511475, + "reward_std": 0.23664319515228271, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.38138505816459656, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 551 + }, + { + "completion_length": 2317.5833740234375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6479.0, + "completions/mean_length": 2866.666748046875, + "completions/mean_terminated_length": 2528.272705078125, + "completions/min_length": 678.0, + "completions/min_terminated_length": 678.0, + "epoch": 0.18724559023066487, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5327326059341431, + "kl": NaN, + "learning_rate": 4.135610766045549e-07, + "loss": -0.0418, + "num_tokens": 19275342.0, + "reward": 0.6916667819023132, + "reward_std": 0.26536139845848083, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.5149286985397339, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 552 + }, + { + "completion_length": 1518.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3452.0, + "completions/max_terminated_length": 3452.0, + "completions/mean_length": 1518.416748046875, + "completions/mean_terminated_length": 1518.416748046875, + "completions/min_length": 456.0, + "completions/min_terminated_length": 456.0, + "epoch": 0.18758480325644505, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5784144401550293, + "kl": 0.0, + "learning_rate": 4.1338854382332643e-07, + "loss": -0.0123, + "num_tokens": 19305695.0, + "reward": 0.7833334803581238, + "reward_std": 0.20412415266036987, + "rewards/correctness_reward_func/mean": 0.4833333492279053, + "rewards/correctness_reward_func/std": 0.43029236793518066, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 553 + }, + { + "completion_length": 1451.5833740234375, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5332.0, + "completions/mean_length": 2549.75, + "completions/mean_terminated_length": 1741.9000244140625, + "completions/min_length": 495.0, + "completions/min_terminated_length": 495.0, + "epoch": 0.18792401628222524, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5653297901153564, + "kl": NaN, + "learning_rate": 4.13216011042098e-07, + "loss": -0.0303, + "num_tokens": 19336656.0, + "reward": 0.9125000834465027, + "reward_std": 0.31922924518585205, + "rewards/correctness_reward_func/mean": 0.6499999761581421, + "rewards/correctness_reward_func/std": 0.40113475918769836, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.09323723614215851, + "step": 554 + }, + { + "completion_length": 3387.5833740234375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5996.0, + "completions/mean_length": 3936.666748046875, + "completions/mean_terminated_length": 3695.545654296875, + "completions/min_length": 2256.0, + "completions/min_terminated_length": 2256.0, + "epoch": 0.18826322930800543, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1428702175617218, + "kl": NaN, + "learning_rate": 4.1304347826086954e-07, + "loss": -0.0036, + "num_tokens": 19388881.0, + "reward": 0.7541667819023132, + "reward_std": 0.08225837349891663, + "rewards/correctness_reward_func/mean": 0.46666666865348816, + "rewards/correctness_reward_func/std": 0.4923659563064575, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 555 + }, + { + "completion_length": 723.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1403.0, + "completions/max_terminated_length": 1403.0, + "completions/mean_length": 723.4166870117188, + "completions/mean_terminated_length": 723.4166870117188, + "completions/min_length": 500.0, + "completions/min_terminated_length": 500.0, + "epoch": 0.18860244233378562, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.9308046717014804e-07, + "kl": 0.0, + "learning_rate": 4.1287094547964115e-07, + "loss": 0.0, + "num_tokens": 19411362.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 556 + }, + { + "completion_length": 995.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1785.0, + "completions/max_terminated_length": 1785.0, + "completions/mean_length": 995.0, + "completions/mean_terminated_length": 995.0, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "epoch": 0.1889416553595658, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.172017670749483e-07, + "kl": 0.0, + "learning_rate": 4.1269841269841265e-07, + "loss": 0.0, + "num_tokens": 19436022.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 557 + }, + { + "completion_length": 2856.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4397.0, + "completions/max_terminated_length": 4397.0, + "completions/mean_length": 2856.83349609375, + "completions/mean_terminated_length": 2856.83349609375, + "completions/min_length": 906.0, + "completions/min_terminated_length": 906.0, + "epoch": 0.189280868385346, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.1252587991718425e-07, + "loss": 0.0, + "num_tokens": 19481284.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 558 + }, + { + "completion_length": 1833.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5297.0, + "completions/max_terminated_length": 5297.0, + "completions/mean_length": 1833.5, + "completions/mean_terminated_length": 1833.5, + "completions/min_length": 596.0, + "completions/min_terminated_length": 596.0, + "epoch": 0.18962008141112618, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.08774584531784058, + "kl": 0.0, + "learning_rate": 4.123533471359558e-07, + "loss": -0.0012, + "num_tokens": 19518106.0, + "reward": 1.1166667938232422, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 559 + }, + { + "completion_length": 1456.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2801.0, + "completions/max_terminated_length": 2801.0, + "completions/mean_length": 1456.75, + "completions/mean_terminated_length": 1456.75, + "completions/min_length": 743.0, + "completions/min_terminated_length": 743.0, + "epoch": 0.18995929443690637, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4802844822406769, + "kl": 0.0, + "learning_rate": 4.121808143547274e-07, + "loss": 0.0104, + "num_tokens": 19546603.0, + "reward": 0.6666667461395264, + "reward_std": 0.18618986010551453, + "rewards/correctness_reward_func/mean": 0.36666664481163025, + "rewards/correctness_reward_func/std": 0.45792677998542786, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 560 + }, + { + "completion_length": 1429.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3128.0, + "completions/max_terminated_length": 3128.0, + "completions/mean_length": 1429.5833740234375, + "completions/mean_terminated_length": 1429.5833740234375, + "completions/min_length": 396.0, + "completions/min_terminated_length": 396.0, + "epoch": 0.19029850746268656, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36741065979003906, + "kl": 0.0, + "learning_rate": 4.120082815734989e-07, + "loss": 0.0001, + "num_tokens": 19574510.0, + "reward": 0.595833420753479, + "reward_std": 0.20437853038311005, + "rewards/correctness_reward_func/mean": 0.3333333432674408, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.06784005463123322, + "step": 561 + }, + { + "completion_length": 1721.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3282.0, + "completions/max_terminated_length": 3282.0, + "completions/mean_length": 1721.0833740234375, + "completions/mean_terminated_length": 1721.0833740234375, + "completions/min_length": 762.0, + "completions/min_terminated_length": 762.0, + "epoch": 0.19063772048846675, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.8861383921375818e-07, + "kl": 0.0, + "learning_rate": 4.118357487922705e-07, + "loss": 0.0, + "num_tokens": 19607967.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 562 + }, + { + "completion_length": 1103.0000305175781, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 1976.0, + "completions/mean_length": 1652.0833740234375, + "completions/mean_terminated_length": 1203.272705078125, + "completions/min_length": 562.0, + "completions/min_terminated_length": 562.0, + "epoch": 0.19097693351424694, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.17974478006362915, + "kl": NaN, + "learning_rate": 4.1166321601104207e-07, + "loss": -0.0182, + "num_tokens": 19638069.0, + "reward": 1.058333396911621, + "reward_std": 0.25380438566207886, + "rewards/correctness_reward_func/mean": 0.7833333015441895, + "rewards/correctness_reward_func/std": 0.26227444410324097, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 563 + }, + { + "completion_length": 660.5000305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1087.0, + "completions/max_terminated_length": 1087.0, + "completions/mean_length": 660.5, + "completions/mean_terminated_length": 660.5, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "epoch": 0.19131614654002713, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.9672070550313947e-07, + "kl": 0.0, + "learning_rate": 4.114906832298137e-07, + "loss": 0.0, + "num_tokens": 19662405.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 564 + }, + { + "completion_length": 913.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2214.0, + "completions/max_terminated_length": 2214.0, + "completions/mean_length": 913.4166870117188, + "completions/mean_terminated_length": 913.4166870117188, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "epoch": 0.19165535956580732, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.26728707551956177, + "kl": 0.0, + "learning_rate": 4.113181504485852e-07, + "loss": -0.0042, + "num_tokens": 19688930.0, + "reward": 1.0333335399627686, + "reward_std": 0.1632993221282959, + "rewards/correctness_reward_func/mean": 0.7333333492279053, + "rewards/correctness_reward_func/std": 0.23094011843204498, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 565 + }, + { + "completion_length": 621.0000305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 913.0, + "completions/max_terminated_length": 913.0, + "completions/mean_length": 621.0, + "completions/mean_terminated_length": 621.0, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.1919945725915875, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.458977163172676e-08, + "kl": 0.0, + "learning_rate": 4.111456176673568e-07, + "loss": 0.0, + "num_tokens": 19708724.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 566 + }, + { + "completion_length": 2206.0001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4598.0, + "completions/max_terminated_length": 4598.0, + "completions/mean_length": 2206.0, + "completions/mean_terminated_length": 2206.0, + "completions/min_length": 614.0, + "completions/min_terminated_length": 614.0, + "epoch": 0.19233378561736772, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29058995842933655, + "kl": 0.0, + "learning_rate": 4.109730848861284e-07, + "loss": -0.0083, + "num_tokens": 19746638.0, + "reward": 1.1166666746139526, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func/mean": 0.8166666030883789, + "rewards/correctness_reward_func/std": 0.27579087018966675, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 567 + }, + { + "completion_length": 1693.0834350585938, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6508.0, + "completions/max_terminated_length": 6508.0, + "completions/mean_length": 1693.0833740234375, + "completions/mean_terminated_length": 1693.0833740234375, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.1926729986431479, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.44586482644081116, + "kl": 0.0, + "learning_rate": 4.108005521048999e-07, + "loss": -0.0105, + "num_tokens": 19778715.0, + "reward": 0.7166666984558105, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.5149286985397339, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 568 + }, + { + "completion_length": 2577.2501220703125, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6125.0, + "completions/mean_length": 3675.416748046875, + "completions/mean_terminated_length": 3092.699951171875, + "completions/min_length": 1502.0, + "completions/min_terminated_length": 1502.0, + "epoch": 0.1930122116689281, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7111157774925232, + "kl": NaN, + "learning_rate": 4.106280193236715e-07, + "loss": -0.0213, + "num_tokens": 19822152.0, + "reward": 0.44166669249534607, + "reward_std": 0.31943613290786743, + "rewards/correctness_reward_func/mean": 0.1666666716337204, + "rewards/correctness_reward_func/std": 0.38924944400787354, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660254627466202, + "step": 569 + }, + { + "completion_length": 1982.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2881.0, + "completions/max_terminated_length": 2881.0, + "completions/mean_length": 1982.3333740234375, + "completions/mean_terminated_length": 1982.3333740234375, + "completions/min_length": 1068.0, + "completions/min_terminated_length": 1068.0, + "epoch": 0.19335142469470828, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.1045548654244304e-07, + "loss": 0.0, + "num_tokens": 19858936.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 570 + }, + { + "completion_length": 1354.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2324.0, + "completions/max_terminated_length": 2324.0, + "completions/mean_length": 1354.416748046875, + "completions/mean_terminated_length": 1354.416748046875, + "completions/min_length": 844.0, + "completions/min_terminated_length": 844.0, + "epoch": 0.19369063772048847, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0446248666085012e-07, + "kl": 0.0, + "learning_rate": 4.1028295376121465e-07, + "loss": 0.0, + "num_tokens": 19888419.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 571 + }, + { + "completion_length": 2613.5001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4610.0, + "completions/max_terminated_length": 4610.0, + "completions/mean_length": 2613.5, + "completions/mean_terminated_length": 2613.5, + "completions/min_length": 714.0, + "completions/min_terminated_length": 714.0, + "epoch": 0.19402985074626866, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.4929959490928013e-07, + "kl": 0.0, + "learning_rate": 4.1011042097998615e-07, + "loss": 0.0, + "num_tokens": 19931457.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 572 + }, + { + "completion_length": 1108.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2757.0, + "completions/max_terminated_length": 2757.0, + "completions/mean_length": 1108.8333740234375, + "completions/mean_terminated_length": 1108.8333740234375, + "completions/min_length": 438.0, + "completions/min_terminated_length": 438.0, + "epoch": 0.19436906377204885, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.05309825763106346, + "kl": 0.0, + "learning_rate": 4.0993788819875776e-07, + "loss": -0.0001, + "num_tokens": 19959157.0, + "reward": 1.2166666984558105, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.9166666865348816, + "rewards/correctness_reward_func/std": 0.10298573225736618, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 573 + }, + { + "completion_length": 811.0000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2012.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 811.0, + "completions/mean_terminated_length": 811.0, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.19470827679782904, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2551961839199066, + "kl": 0.0, + "learning_rate": 4.097653554175293e-07, + "loss": -0.001, + "num_tokens": 19981459.0, + "reward": 1.0333335399627686, + "reward_std": 0.1632993221282959, + "rewards/correctness_reward_func/mean": 0.7333333492279053, + "rewards/correctness_reward_func/std": 0.2309401035308838, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 574 + }, + { + "completion_length": 1731.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3343.0, + "completions/max_terminated_length": 3343.0, + "completions/mean_length": 1731.666748046875, + "completions/mean_terminated_length": 1731.666748046875, + "completions/min_length": 636.0, + "completions/min_terminated_length": 636.0, + "epoch": 0.19504748982360923, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12224812805652618, + "kl": 0.0, + "learning_rate": 4.095928226363009e-07, + "loss": 0.0014, + "num_tokens": 20017251.0, + "reward": 1.2000000476837158, + "reward_std": 0.08164963126182556, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 575 + }, + { + "completion_length": 2264.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4903.0, + "completions/max_terminated_length": 4903.0, + "completions/mean_length": 2264.916748046875, + "completions/mean_terminated_length": 2264.916748046875, + "completions/min_length": 673.0, + "completions/min_terminated_length": 673.0, + "epoch": 0.19538670284938942, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07465171068906784, + "kl": 0.0, + "learning_rate": 4.094202898550724e-07, + "loss": 0.0004, + "num_tokens": 20059640.0, + "reward": 1.133333444595337, + "reward_std": 0.05163976177573204, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 576 + }, + { + "completion_length": 711.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1438.0, + "completions/max_terminated_length": 1438.0, + "completions/mean_length": 711.9166870117188, + "completions/mean_terminated_length": 711.9166870117188, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.1957259158751696, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.3539010286331177, + "kl": 0.0, + "learning_rate": 4.09247757073844e-07, + "loss": -0.0026, + "num_tokens": 20076679.0, + "reward": 1.000000238418579, + "reward_std": 0.23664319515228271, + "rewards/correctness_reward_func/mean": 0.699999988079071, + "rewards/correctness_reward_func/std": 0.3357488214969635, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 577 + }, + { + "completion_length": 2636.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5236.0, + "completions/max_terminated_length": 5236.0, + "completions/mean_length": 2636.0, + "completions/mean_terminated_length": 2636.0, + "completions/min_length": 934.0, + "completions/min_terminated_length": 934.0, + "epoch": 0.1960651289009498, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1721926927566528, + "kl": 0.0, + "learning_rate": 4.0907522429261557e-07, + "loss": 0.0262, + "num_tokens": 20117005.0, + "reward": 0.8041667342185974, + "reward_std": 0.43039870262145996, + "rewards/correctness_reward_func/mean": 0.5166666507720947, + "rewards/correctness_reward_func/std": 0.4628632962703705, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 578 + }, + { + "completion_length": 1920.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4433.0, + "completions/max_terminated_length": 4433.0, + "completions/mean_length": 1920.5833740234375, + "completions/mean_terminated_length": 1920.5833740234375, + "completions/min_length": 811.0, + "completions/min_terminated_length": 811.0, + "epoch": 0.19640434192672998, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5124653577804565, + "kl": 0.0, + "learning_rate": 4.089026915113871e-07, + "loss": -0.0106, + "num_tokens": 20153522.0, + "reward": 1.0833332538604736, + "reward_std": 0.222860187292099, + "rewards/correctness_reward_func/mean": 0.7833333015441895, + "rewards/correctness_reward_func/std": 0.37618499994277954, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 579 + }, + { + "completion_length": 1807.25, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6471.0, + "completions/mean_length": 3454.5, + "completions/mean_terminated_length": 2409.666748046875, + "completions/min_length": 540.0, + "completions/min_terminated_length": 540.0, + "epoch": 0.19674355495251017, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.17243024706840515, + "kl": NaN, + "learning_rate": 4.087301587301587e-07, + "loss": -0.0296, + "num_tokens": 20189837.0, + "reward": 0.6125000715255737, + "reward_std": 0.07373940199613571, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.21250002086162567, + "rewards/format_reward_func/std": 0.13505050539970398, + "step": 580 + }, + { + "completion_length": 2128.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4383.0, + "completions/max_terminated_length": 4383.0, + "completions/mean_length": 2128.25, + "completions/mean_terminated_length": 2128.25, + "completions/min_length": 700.0, + "completions/min_terminated_length": 700.0, + "epoch": 0.19708276797829036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.085576259489303e-07, + "loss": 0.0, + "num_tokens": 20230106.0, + "reward": 0.30000001192092896, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 581 + }, + { + "completion_length": 2194.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4675.0, + "completions/max_terminated_length": 4675.0, + "completions/mean_length": 2194.83349609375, + "completions/mean_terminated_length": 2194.83349609375, + "completions/min_length": 799.0, + "completions/min_terminated_length": 799.0, + "epoch": 0.19742198100407055, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6451249122619629, + "kl": 0.0, + "learning_rate": 4.083850931677019e-07, + "loss": -0.0096, + "num_tokens": 20267946.0, + "reward": 1.070833444595337, + "reward_std": 0.29760777950286865, + "rewards/correctness_reward_func/mean": 0.7833333015441895, + "rewards/correctness_reward_func/std": 0.37618499994277954, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 582 + }, + { + "completion_length": 1643.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3262.0, + "completions/max_terminated_length": 3262.0, + "completions/mean_length": 1643.916748046875, + "completions/mean_terminated_length": 1643.916748046875, + "completions/min_length": 862.0, + "completions/min_terminated_length": 862.0, + "epoch": 0.19776119402985073, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13853637874126434, + "kl": 0.0, + "learning_rate": 4.082125603864734e-07, + "loss": -0.0048, + "num_tokens": 20298029.0, + "reward": 1.2208333015441895, + "reward_std": 0.10064341127872467, + "rewards/correctness_reward_func/mean": 0.9333333373069763, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 583 + }, + { + "completion_length": 2544.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5490.0, + "completions/max_terminated_length": 5490.0, + "completions/mean_length": 2544.166748046875, + "completions/mean_terminated_length": 2544.166748046875, + "completions/min_length": 862.0, + "completions/min_terminated_length": 862.0, + "epoch": 0.19810040705563092, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7164510488510132, + "kl": 0.0, + "learning_rate": 4.08040027605245e-07, + "loss": 0.0183, + "num_tokens": 20340841.0, + "reward": 1.1166666746139526, + "reward_std": 0.24832773208618164, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.3857302963733673, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 584 + }, + { + "completion_length": 843.9166870117188, + "completions/clipped_ratio": 0.5, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 2172.0, + "completions/mean_length": 4138.4169921875, + "completions/mean_terminated_length": 1687.8333740234375, + "completions/min_length": 1129.0, + "completions/min_terminated_length": 1129.0, + "epoch": 0.19843962008141114, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.10080356895923615, + "kl": NaN, + "learning_rate": 4.0786749482401655e-07, + "loss": -0.001, + "num_tokens": 20363022.0, + "reward": 0.5833333730697632, + "reward_std": 0.051639772951602936, + "rewards/correctness_reward_func/mean": 0.43333330750465393, + "rewards/correctness_reward_func/std": 0.45792683959007263, + "rewards/format_reward_func/mean": 0.15000000596046448, + "rewards/format_reward_func/std": 0.15666989982128143, + "step": 585 + }, + { + "completion_length": 1471.416748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2505.0, + "completions/max_terminated_length": 2505.0, + "completions/mean_length": 1471.416748046875, + "completions/mean_terminated_length": 1471.416748046875, + "completions/min_length": 909.0, + "completions/min_terminated_length": 909.0, + "epoch": 0.19877883310719133, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07403448224067688, + "kl": 0.0, + "learning_rate": 4.0769496204278815e-07, + "loss": -0.0001, + "num_tokens": 20390729.0, + "reward": 0.7166666984558105, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 586 + }, + { + "completion_length": 1442.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2322.0, + "completions/max_terminated_length": 2322.0, + "completions/mean_length": 1442.5833740234375, + "completions/mean_terminated_length": 1442.5833740234375, + "completions/min_length": 698.0, + "completions/min_terminated_length": 698.0, + "epoch": 0.19911804613297152, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.0752242926155965e-07, + "loss": 0.0, + "num_tokens": 20417646.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 587 + }, + { + "completion_length": 1039.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1718.0, + "completions/max_terminated_length": 1718.0, + "completions/mean_length": 1039.416748046875, + "completions/mean_terminated_length": 1039.416748046875, + "completions/min_length": 724.0, + "completions/min_terminated_length": 724.0, + "epoch": 0.1994572591587517, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09593465924263, + "kl": 0.0, + "learning_rate": 4.0734989648033126e-07, + "loss": -0.0003, + "num_tokens": 20442431.0, + "reward": 1.2666666507720947, + "reward_std": 0.051639750599861145, + "rewards/correctness_reward_func/mean": 0.9666666984558105, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 588 + }, + { + "completion_length": 3065.416748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6188.0, + "completions/max_terminated_length": 6188.0, + "completions/mean_length": 3065.416748046875, + "completions/mean_terminated_length": 3065.416748046875, + "completions/min_length": 804.0, + "completions/min_terminated_length": 804.0, + "epoch": 0.1997964721845319, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.071773636991028e-07, + "loss": 0.0, + "num_tokens": 20492140.0, + "reward": 0.30000001192092896, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 589 + }, + { + "completion_length": 704.8333435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 965.0, + "completions/max_terminated_length": 965.0, + "completions/mean_length": 704.8333740234375, + "completions/mean_terminated_length": 704.8333740234375, + "completions/min_length": 576.0, + "completions/min_terminated_length": 576.0, + "epoch": 0.20013568521031208, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.2550463540937926e-07, + "kl": 0.0, + "learning_rate": 4.0700483091787437e-07, + "loss": 0.0, + "num_tokens": 20516426.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 590 + }, + { + "completion_length": 1735.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3626.0, + "completions/max_terminated_length": 3626.0, + "completions/mean_length": 1735.8333740234375, + "completions/mean_terminated_length": 1735.8333740234375, + "completions/min_length": 1012.0, + "completions/min_terminated_length": 1012.0, + "epoch": 0.20047489823609227, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.45365244150161743, + "kl": 0.0, + "learning_rate": 4.068322981366459e-07, + "loss": 0.0057, + "num_tokens": 20547852.0, + "reward": 0.5666667222976685, + "reward_std": 0.20655912160873413, + "rewards/correctness_reward_func/mean": 0.2666666805744171, + "rewards/correctness_reward_func/std": 0.393892765045166, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 591 + }, + { + "completion_length": 1736.8333740234375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4041.0, + "completions/mean_length": 2285.916748046875, + "completions/mean_terminated_length": 1894.727294921875, + "completions/min_length": 598.0, + "completions/min_terminated_length": 598.0, + "epoch": 0.20081411126187246, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2545448839664459, + "kl": NaN, + "learning_rate": 4.066597653554175e-07, + "loss": -0.034, + "num_tokens": 20578780.0, + "reward": 1.0750000476837158, + "reward_std": 0.29088661074638367, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.2696799635887146, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660254627466202, + "step": 592 + }, + { + "completion_length": 1546.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2271.0, + "completions/max_terminated_length": 2271.0, + "completions/mean_length": 1546.916748046875, + "completions/mean_terminated_length": 1546.916748046875, + "completions/min_length": 638.0, + "completions/min_terminated_length": 638.0, + "epoch": 0.20115332428765265, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09835981577634811, + "kl": 0.0, + "learning_rate": 4.064872325741891e-07, + "loss": -0.0017, + "num_tokens": 20607141.0, + "reward": 1.1166667938232422, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 593 + }, + { + "completion_length": 2359.0833740234375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4462.0, + "completions/mean_length": 2908.166748046875, + "completions/mean_terminated_length": 2573.54541015625, + "completions/min_length": 781.0, + "completions/min_terminated_length": 781.0, + "epoch": 0.20149253731343283, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07446026802062988, + "kl": NaN, + "learning_rate": 4.0631469979296063e-07, + "loss": -0.004, + "num_tokens": 20643640.0, + "reward": 0.27500003576278687, + "reward_std": 0.06123724579811096, + "rewards/correctness_reward_func/mean": 0.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660254627466202, + "step": 594 + }, + { + "completion_length": 1952.8333740234375, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6144.0, + "completions/mean_length": 3051.0, + "completions/mean_terminated_length": 2343.400146484375, + "completions/min_length": 534.0, + "completions/min_terminated_length": 534.0, + "epoch": 0.20183175033921302, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.30042797327041626, + "kl": NaN, + "learning_rate": 4.061421670117322e-07, + "loss": -0.0247, + "num_tokens": 20680088.0, + "reward": 0.6375000476837158, + "reward_std": 0.07373940199613571, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.23750001192092896, + "rewards/format_reward_func/std": 0.11894422769546509, + "step": 595 + }, + { + "completion_length": 1296.2500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2377.0, + "completions/max_terminated_length": 2377.0, + "completions/mean_length": 1296.25, + "completions/mean_terminated_length": 1296.25, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "epoch": 0.2021709633649932, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.059696342305038e-07, + "loss": 0.0, + "num_tokens": 20705621.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 596 + }, + { + "completion_length": 1768.0833435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4853.0, + "completions/max_terminated_length": 4853.0, + "completions/mean_length": 1768.0833740234375, + "completions/mean_terminated_length": 1768.0833740234375, + "completions/min_length": 545.0, + "completions/min_terminated_length": 545.0, + "epoch": 0.2025101763907734, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.6289086463293643e-07, + "kl": 0.0, + "learning_rate": 4.057971014492754e-07, + "loss": 0.0, + "num_tokens": 20738388.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 597 + }, + { + "completion_length": 2543.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4530.0, + "completions/max_terminated_length": 4530.0, + "completions/mean_length": 2543.0, + "completions/mean_terminated_length": 2543.0, + "completions/min_length": 1381.0, + "completions/min_terminated_length": 1381.0, + "epoch": 0.2028493894165536, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.6963534221758891e-07, + "kl": 0.0, + "learning_rate": 4.056245686680469e-07, + "loss": 0.0, + "num_tokens": 20782416.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 598 + }, + { + "completion_length": 1435.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2731.0, + "completions/max_terminated_length": 2731.0, + "completions/mean_length": 1435.3333740234375, + "completions/mean_terminated_length": 1435.3333740234375, + "completions/min_length": 720.0, + "completions/min_terminated_length": 720.0, + "epoch": 0.20318860244233378, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.1025226041674614, + "kl": 0.0, + "learning_rate": 4.054520358868185e-07, + "loss": 0.0002, + "num_tokens": 20813830.0, + "reward": 1.1500000953674316, + "reward_std": 0.05477222427725792, + "rewards/correctness_reward_func/mean": 0.8500000834465027, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 599 + }, + { + "completion_length": 794.7500305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1313.0, + "completions/max_terminated_length": 1313.0, + "completions/mean_length": 794.75, + "completions/mean_terminated_length": 794.75, + "completions/min_length": 557.0, + "completions/min_terminated_length": 557.0, + "epoch": 0.20352781546811397, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.06588659435510635, + "kl": 0.0, + "learning_rate": 4.0527950310559005e-07, + "loss": 0.0002, + "num_tokens": 20837977.0, + "reward": 1.2666666507720947, + "reward_std": 0.051639750599861145, + "rewards/correctness_reward_func/mean": 0.9666666984558105, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 600 + }, + { + "completion_length": 1211.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1856.0, + "completions/max_terminated_length": 1856.0, + "completions/mean_length": 1211.3333740234375, + "completions/mean_terminated_length": 1211.3333740234375, + "completions/min_length": 832.0, + "completions/min_terminated_length": 832.0, + "epoch": 0.20386702849389415, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0307384457064472e-07, + "kl": 0.0, + "learning_rate": 4.051069703243616e-07, + "loss": 0.0, + "num_tokens": 20861033.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 601 + }, + { + "completion_length": 2811.416748046875, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6511.0, + "completions/mean_length": 3360.5, + "completions/mean_terminated_length": 3067.0, + "completions/min_length": 1628.0, + "completions/min_terminated_length": 1628.0, + "epoch": 0.20420624151967434, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17239151895046234, + "kl": NaN, + "learning_rate": 4.0493443754313316e-07, + "loss": -0.0131, + "num_tokens": 20907724.0, + "reward": 1.2208333015441895, + "reward_std": 0.1265007108449936, + "rewards/correctness_reward_func/mean": 0.9333333373069763, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 602 + }, + { + "completion_length": 2544.7501220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5494.0, + "completions/max_terminated_length": 5494.0, + "completions/mean_length": 2544.75, + "completions/mean_terminated_length": 2544.75, + "completions/min_length": 660.0, + "completions/min_terminated_length": 660.0, + "epoch": 0.20454545454545456, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5830628871917725, + "kl": 0.0, + "learning_rate": 4.0476190476190476e-07, + "loss": -0.0089, + "num_tokens": 20947273.0, + "reward": 0.9666666984558105, + "reward_std": 0.2707287669181824, + "rewards/correctness_reward_func/mean": 0.6666666865348816, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 603 + }, + { + "completion_length": 2008.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3416.0, + "completions/max_terminated_length": 3416.0, + "completions/mean_length": 2008.25, + "completions/mean_terminated_length": 2008.25, + "completions/min_length": 621.0, + "completions/min_terminated_length": 621.0, + "epoch": 0.20488466757123475, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4334089159965515, + "kl": 0.0, + "learning_rate": 4.045893719806763e-07, + "loss": -0.0058, + "num_tokens": 20984080.0, + "reward": 0.6208333969116211, + "reward_std": 0.19391795992851257, + "rewards/correctness_reward_func/mean": 0.3333333432674408, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 604 + }, + { + "completion_length": 1733.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3784.0, + "completions/max_terminated_length": 3784.0, + "completions/mean_length": 1733.0, + "completions/mean_terminated_length": 1733.0, + "completions/min_length": 863.0, + "completions/min_terminated_length": 863.0, + "epoch": 0.20522388059701493, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4287903904914856, + "kl": 0.0, + "learning_rate": 4.0441683919944787e-07, + "loss": 0.0233, + "num_tokens": 21015982.0, + "reward": 0.833333432674408, + "reward_std": 0.20655912160873413, + "rewards/correctness_reward_func/mean": 0.5333333611488342, + "rewards/correctness_reward_func/std": 0.3938928246498108, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 605 + }, + { + "completion_length": 2514.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4962.0, + "completions/max_terminated_length": 4962.0, + "completions/mean_length": 2514.166748046875, + "completions/mean_terminated_length": 2514.166748046875, + "completions/min_length": 1261.0, + "completions/min_terminated_length": 1261.0, + "epoch": 0.20556309362279512, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6668354272842407, + "kl": 0.0, + "learning_rate": 4.042443064182194e-07, + "loss": 0.03, + "num_tokens": 21056592.0, + "reward": 1.070833444595337, + "reward_std": 0.2486901879310608, + "rewards/correctness_reward_func/mean": 0.7833333015441895, + "rewards/correctness_reward_func/std": 0.26227444410324097, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 606 + }, + { + "completion_length": 1062.1666870117188, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 3721.0, + "completions/mean_length": 2160.33349609375, + "completions/mean_terminated_length": 1274.5999755859375, + "completions/min_length": 367.0, + "completions/min_terminated_length": 367.0, + "epoch": 0.2059023066485753, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.10183367133140564, + "kl": NaN, + "learning_rate": 4.0407177363699103e-07, + "loss": -0.0125, + "num_tokens": 21083894.0, + "reward": 0.6500000953674316, + "reward_std": 0.0774596780538559, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.25, + "rewards/format_reward_func/std": 0.11677484214305878, + "step": 607 + }, + { + "completion_length": 2220.5001220703125, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6247.0, + "completions/mean_length": 2769.58349609375, + "completions/mean_terminated_length": 2422.36376953125, + "completions/min_length": 623.0, + "completions/min_terminated_length": 623.0, + "epoch": 0.2062415196743555, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11050547659397125, + "kl": NaN, + "learning_rate": 4.038992408557626e-07, + "loss": -0.0143, + "num_tokens": 21120482.0, + "reward": 0.7416666746139526, + "reward_std": 0.1128769963979721, + "rewards/correctness_reward_func/mean": 0.46666666865348816, + "rewards/correctness_reward_func/std": 0.4923659563064575, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660254627466202, + "step": 608 + }, + { + "completion_length": 1264.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1872.0, + "completions/max_terminated_length": 1872.0, + "completions/mean_length": 1264.5833740234375, + "completions/mean_terminated_length": 1264.5833740234375, + "completions/min_length": 664.0, + "completions/min_terminated_length": 664.0, + "epoch": 0.2065807327001357, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.325391423279143e-07, + "kl": 0.0, + "learning_rate": 4.0372670807453413e-07, + "loss": 0.0, + "num_tokens": 21142509.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 609 + }, + { + "completion_length": 1594.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3316.0, + "completions/max_terminated_length": 3316.0, + "completions/mean_length": 1594.5, + "completions/mean_terminated_length": 1594.5, + "completions/min_length": 465.0, + "completions/min_terminated_length": 465.0, + "epoch": 0.20691994572591588, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.580161020039668e-08, + "kl": 0.0, + "learning_rate": 4.035541752933057e-07, + "loss": 0.0, + "num_tokens": 21176565.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 610 + }, + { + "completion_length": 2312.916748046875, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5161.0, + "completions/mean_length": 2862.0, + "completions/mean_terminated_length": 2523.181884765625, + "completions/min_length": 646.0, + "completions/min_terminated_length": 646.0, + "epoch": 0.20725915875169607, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09654112905263901, + "kl": NaN, + "learning_rate": 4.033816425120773e-07, + "loss": -0.0072, + "num_tokens": 21219494.0, + "reward": 0.6750000715255737, + "reward_std": 0.06123724579811096, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 611 + }, + { + "completion_length": 1352.666748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2310.0, + "completions/max_terminated_length": 2310.0, + "completions/mean_length": 1352.666748046875, + "completions/mean_terminated_length": 1352.666748046875, + "completions/min_length": 523.0, + "completions/min_terminated_length": 523.0, + "epoch": 0.20759837177747625, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.9667629658215446e-07, + "kl": 0.0, + "learning_rate": 4.032091097308488e-07, + "loss": 0.0, + "num_tokens": 21250126.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 612 + }, + { + "completion_length": 2175.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4093.0, + "completions/max_terminated_length": 4093.0, + "completions/mean_length": 2175.58349609375, + "completions/mean_terminated_length": 2175.58349609375, + "completions/min_length": 1203.0, + "completions/min_terminated_length": 1203.0, + "epoch": 0.20793758480325644, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5248443484306335, + "kl": 0.0, + "learning_rate": 4.030365769496204e-07, + "loss": 0.0088, + "num_tokens": 21281873.0, + "reward": 1.1666667461395264, + "reward_std": 0.2588964104652405, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.287096232175827, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 613 + }, + { + "completion_length": 2334.75, + "completions/clipped_ratio": 0.33333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6393.0, + "completions/mean_length": 4531.08349609375, + "completions/mean_terminated_length": 3502.125, + "completions/min_length": 880.0, + "completions/min_terminated_length": 880.0, + "epoch": 0.20827679782903663, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.24544914066791534, + "kl": NaN, + "learning_rate": 4.02864044168392e-07, + "loss": -0.0219, + "num_tokens": 21318176.0, + "reward": 0.6000000834465027, + "reward_std": 0.0774596780538559, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.20000000298023224, + "rewards/format_reward_func/std": 0.14770980179309845, + "step": 614 + }, + { + "completion_length": 606.3333435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1353.0, + "completions/max_terminated_length": 1353.0, + "completions/mean_length": 606.3333740234375, + "completions/mean_terminated_length": 606.3333740234375, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.20861601085481682, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.37288540601730347, + "kl": 0.0, + "learning_rate": 4.0269151138716356e-07, + "loss": -0.0039, + "num_tokens": 21336690.0, + "reward": 1.1041667461395264, + "reward_std": 0.23474276065826416, + "rewards/correctness_reward_func/mean": 0.8166666030883789, + "rewards/correctness_reward_func/std": 0.27579087018966675, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 615 + }, + { + "completion_length": 1619.416748046875, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4832.0, + "completions/mean_length": 2168.5, + "completions/mean_terminated_length": 1766.636474609375, + "completions/min_length": 653.0, + "completions/min_terminated_length": 653.0, + "epoch": 0.208955223880597, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07448533922433853, + "kl": NaN, + "learning_rate": 4.025189786059351e-07, + "loss": -0.0086, + "num_tokens": 21372959.0, + "reward": 0.7749999761581421, + "reward_std": 0.06123724579811096, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 616 + }, + { + "completion_length": 2493.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5083.0, + "completions/max_terminated_length": 5083.0, + "completions/mean_length": 2493.75, + "completions/mean_terminated_length": 2493.75, + "completions/min_length": 848.0, + "completions/min_terminated_length": 848.0, + "epoch": 0.2092944369063772, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.4945359794182878e-07, + "kl": 0.0, + "learning_rate": 4.0234644582470666e-07, + "loss": 0.0, + "num_tokens": 21419780.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 617 + }, + { + "completion_length": 1921.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2628.0, + "completions/max_terminated_length": 2628.0, + "completions/mean_length": 1921.0833740234375, + "completions/mean_terminated_length": 1921.0833740234375, + "completions/min_length": 1042.0, + "completions/min_terminated_length": 1042.0, + "epoch": 0.20963364993215738, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.0217391304347827e-07, + "loss": 0.0, + "num_tokens": 21449523.0, + "reward": 1.2999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 1.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 618 + }, + { + "completion_length": 701.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1136.0, + "completions/max_terminated_length": 1136.0, + "completions/mean_length": 701.75, + "completions/mean_terminated_length": 701.75, + "completions/min_length": 426.0, + "completions/min_terminated_length": 426.0, + "epoch": 0.20997286295793757, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.060739900916814804, + "kl": 0.0, + "learning_rate": 4.020013802622498e-07, + "loss": 0.0002, + "num_tokens": 21467394.0, + "reward": 1.1166667938232422, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 619 + }, + { + "completion_length": 1590.416748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3286.0, + "completions/max_terminated_length": 3286.0, + "completions/mean_length": 1590.416748046875, + "completions/mean_terminated_length": 1590.416748046875, + "completions/min_length": 756.0, + "completions/min_terminated_length": 756.0, + "epoch": 0.21031207598371776, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14383164048194885, + "kl": 0.0, + "learning_rate": 4.018288474810214e-07, + "loss": 0.0018, + "num_tokens": 21492803.0, + "reward": 1.183333396911621, + "reward_std": 0.09246458858251572, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 620 + }, + { + "completion_length": 2816.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4851.0, + "completions/max_terminated_length": 4851.0, + "completions/mean_length": 2816.75, + "completions/mean_terminated_length": 2816.75, + "completions/min_length": 1029.0, + "completions/min_terminated_length": 1029.0, + "epoch": 0.21065128900949798, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.1251380890607834, + "kl": 0.0, + "learning_rate": 4.0165631469979293e-07, + "loss": -0.0009, + "num_tokens": 21541196.0, + "reward": 1.133333444595337, + "reward_std": 0.05163976177573204, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 621 + }, + { + "completion_length": 977.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1705.0, + "completions/max_terminated_length": 1705.0, + "completions/mean_length": 977.25, + "completions/mean_terminated_length": 977.25, + "completions/min_length": 595.0, + "completions/min_terminated_length": 595.0, + "epoch": 0.21099050203527817, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.0148378191856453e-07, + "loss": 0.0, + "num_tokens": 21562223.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 622 + }, + { + "completion_length": 1879.0834350585938, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4423.0, + "completions/max_terminated_length": 4423.0, + "completions/mean_length": 1879.0833740234375, + "completions/mean_terminated_length": 1879.0833740234375, + "completions/min_length": 489.0, + "completions/min_terminated_length": 489.0, + "epoch": 0.21132971506105835, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.135577991604805, + "kl": 0.0, + "learning_rate": 4.0131124913733603e-07, + "loss": -0.0018, + "num_tokens": 21602580.0, + "reward": 1.1541666984558105, + "reward_std": 0.06497842073440552, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 623 + }, + { + "completion_length": 2042.5, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6396.0, + "completions/mean_length": 3689.75, + "completions/mean_terminated_length": 2723.333251953125, + "completions/min_length": 858.0, + "completions/min_terminated_length": 858.0, + "epoch": 0.21166892808683854, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0774798393249512, + "kl": NaN, + "learning_rate": 4.0113871635610764e-07, + "loss": -0.1023, + "num_tokens": 21640638.0, + "reward": 0.8916667699813843, + "reward_std": 0.3968444764614105, + "rewards/correctness_reward_func/mean": 0.6666666269302368, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.22500000894069672, + "rewards/format_reward_func/std": 0.13568010926246643, + "step": 624 + }, + { + "completion_length": 1797.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4817.0, + "completions/max_terminated_length": 4817.0, + "completions/mean_length": 1797.5833740234375, + "completions/mean_terminated_length": 1797.5833740234375, + "completions/min_length": 551.0, + "completions/min_terminated_length": 551.0, + "epoch": 0.21200814111261873, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.009661835748792e-07, + "loss": 0.0, + "num_tokens": 21676483.0, + "reward": 1.2999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 1.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 625 + }, + { + "completion_length": 2595.0001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5171.0, + "completions/max_terminated_length": 5171.0, + "completions/mean_length": 2595.0, + "completions/mean_terminated_length": 2595.0, + "completions/min_length": 508.0, + "completions/min_terminated_length": 508.0, + "epoch": 0.21234735413839892, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4653422236442566, + "kl": 0.0, + "learning_rate": 4.007936507936508e-07, + "loss": -0.0322, + "num_tokens": 21721933.0, + "reward": 1.1708333492279053, + "reward_std": 0.2519000172615051, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.28867512941360474, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 626 + }, + { + "completion_length": 768.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1599.0, + "completions/max_terminated_length": 1599.0, + "completions/mean_length": 768.5, + "completions/mean_terminated_length": 768.5, + "completions/min_length": 427.0, + "completions/min_terminated_length": 427.0, + "epoch": 0.2126865671641791, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.05752098560333252, + "kl": 0.0, + "learning_rate": 4.006211180124223e-07, + "loss": -0.0001, + "num_tokens": 21742123.0, + "reward": 0.7166666984558105, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 627 + }, + { + "completion_length": 1799.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3195.0, + "completions/max_terminated_length": 3195.0, + "completions/mean_length": 1799.0833740234375, + "completions/mean_terminated_length": 1799.0833740234375, + "completions/min_length": 731.0, + "completions/min_terminated_length": 731.0, + "epoch": 0.2130257801899593, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.11317743360996246, + "kl": 0.0, + "learning_rate": 4.004485852311939e-07, + "loss": 0.0022, + "num_tokens": 21775028.0, + "reward": 0.7041667699813843, + "reward_std": 0.05571504682302475, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 628 + }, + { + "completion_length": 2136.5834350585938, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4480.0, + "completions/max_terminated_length": 4480.0, + "completions/mean_length": 2136.58349609375, + "completions/mean_terminated_length": 2136.58349609375, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "epoch": 0.21336499321573948, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.4856317420708365e-07, + "kl": 0.0, + "learning_rate": 4.002760524499655e-07, + "loss": 0.0, + "num_tokens": 21813543.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 629 + }, + { + "completion_length": 1928.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5380.0, + "completions/max_terminated_length": 5380.0, + "completions/mean_length": 1928.5833740234375, + "completions/mean_terminated_length": 1928.5833740234375, + "completions/min_length": 874.0, + "completions/min_terminated_length": 874.0, + "epoch": 0.21370420624151967, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.1090475544333458, + "kl": 0.0, + "learning_rate": 4.0010351966873706e-07, + "loss": 0.0027, + "num_tokens": 21848944.0, + "reward": 0.7833333015441895, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.4833333492279053, + "rewards/correctness_reward_func/std": 0.5078176856040955, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 630 + }, + { + "completion_length": 1004.1666870117188, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 3440.0, + "completions/mean_length": 1553.25, + "completions/mean_terminated_length": 1095.45458984375, + "completions/min_length": 449.0, + "completions/min_terminated_length": 449.0, + "epoch": 0.21404341926729986, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.3620413541793823, + "kl": NaN, + "learning_rate": 3.999309868875086e-07, + "loss": -0.0181, + "num_tokens": 21870372.0, + "reward": 1.1583333015441895, + "reward_std": 0.25380438566207886, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.28867512941360474, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 631 + }, + { + "completion_length": 2100.7501220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3441.0, + "completions/max_terminated_length": 3441.0, + "completions/mean_length": 2100.75, + "completions/mean_terminated_length": 2100.75, + "completions/min_length": 830.0, + "completions/min_terminated_length": 830.0, + "epoch": 0.21438263229308005, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1738094985485077, + "kl": 0.0, + "learning_rate": 3.9975845410628017e-07, + "loss": 0.0015, + "num_tokens": 21907617.0, + "reward": 1.2166666984558105, + "reward_std": 0.10641199350357056, + "rewards/correctness_reward_func/mean": 0.9166666865348816, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 632 + }, + { + "completion_length": 3179.416748046875, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6277.0, + "completions/mean_length": 3728.5, + "completions/mean_terminated_length": 3468.45458984375, + "completions/min_length": 1711.0, + "completions/min_terminated_length": 1711.0, + "epoch": 0.21472184531886024, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8372994661331177, + "kl": NaN, + "learning_rate": 3.9958592132505177e-07, + "loss": -0.0076, + "num_tokens": 21959930.0, + "reward": 0.8750001788139343, + "reward_std": 0.2524876296520233, + "rewards/correctness_reward_func/mean": 0.5999999642372131, + "rewards/correctness_reward_func/std": 0.36181363463401794, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 633 + }, + { + "completion_length": 1077.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2167.0, + "completions/max_terminated_length": 2167.0, + "completions/mean_length": 1077.416748046875, + "completions/mean_terminated_length": 1077.416748046875, + "completions/min_length": 652.0, + "completions/min_terminated_length": 652.0, + "epoch": 0.21506105834464043, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.7163471827407193e-07, + "kl": 0.0, + "learning_rate": 3.9941338854382327e-07, + "loss": 0.0, + "num_tokens": 21984481.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 634 + }, + { + "completion_length": 1683.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3120.0, + "completions/max_terminated_length": 3120.0, + "completions/mean_length": 1683.0, + "completions/mean_terminated_length": 1683.0, + "completions/min_length": 1112.0, + "completions/min_terminated_length": 1112.0, + "epoch": 0.21540027137042062, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.3120736614146153e-07, + "kl": 0.0, + "learning_rate": 3.992408557625949e-07, + "loss": 0.0, + "num_tokens": 22015363.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 635 + }, + { + "completion_length": 1880.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4715.0, + "completions/max_terminated_length": 4715.0, + "completions/mean_length": 1880.0, + "completions/mean_terminated_length": 1880.0, + "completions/min_length": 657.0, + "completions/min_terminated_length": 657.0, + "epoch": 0.2157394843962008, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.4236021545220865e-07, + "kl": 0.0, + "learning_rate": 3.9906832298136643e-07, + "loss": 0.0, + "num_tokens": 22054771.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 636 + }, + { + "completion_length": 1170.9166870117188, + "completions/clipped_ratio": 0.5, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4163.0, + "completions/mean_length": 4465.4169921875, + "completions/mean_terminated_length": 2341.83349609375, + "completions/min_length": 1286.0, + "completions/min_terminated_length": 1286.0, + "epoch": 0.216078697421981, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.13113607466220856, + "kl": NaN, + "learning_rate": 3.9889579020013804e-07, + "loss": -0.0029, + "num_tokens": 22081302.0, + "reward": 0.6166666746139526, + "reward_std": 0.05163975805044174, + "rewards/correctness_reward_func/mean": 0.46666666865348816, + "rewards/correctness_reward_func/std": 0.4923659563064575, + "rewards/format_reward_func/mean": 0.15000000596046448, + "rewards/format_reward_func/std": 0.15666989982128143, + "step": 637 + }, + { + "completion_length": 1980.0000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5188.0, + "completions/max_terminated_length": 5188.0, + "completions/mean_length": 1980.0, + "completions/mean_terminated_length": 1980.0, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "epoch": 0.21641791044776118, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6408343315124512, + "kl": 0.0, + "learning_rate": 3.9872325741890954e-07, + "loss": -0.0118, + "num_tokens": 22119900.0, + "reward": 0.8833333849906921, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func/mean": 0.5833333134651184, + "rewards/correctness_reward_func/std": 0.5149286389350891, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 638 + }, + { + "completion_length": 3710.666748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6489.0, + "completions/max_terminated_length": 6489.0, + "completions/mean_length": 3710.666748046875, + "completions/mean_terminated_length": 3710.666748046875, + "completions/min_length": 2540.0, + "completions/min_terminated_length": 2540.0, + "epoch": 0.2167571234735414, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0129146575927734, + "kl": 0.0, + "learning_rate": 3.9855072463768114e-07, + "loss": 0.016, + "num_tokens": 22177430.0, + "reward": 1.1000001430511475, + "reward_std": 0.21493908762931824, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.2696799635887146, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 639 + }, + { + "completion_length": 1299.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2228.0, + "completions/max_terminated_length": 2228.0, + "completions/mean_length": 1299.416748046875, + "completions/mean_terminated_length": 1299.416748046875, + "completions/min_length": 931.0, + "completions/min_terminated_length": 931.0, + "epoch": 0.21709633649932158, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09300534427165985, + "kl": 0.0, + "learning_rate": 3.983781918564527e-07, + "loss": 0.0, + "num_tokens": 22206049.0, + "reward": 1.25, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.949999988079071, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 640 + }, + { + "completion_length": 3040.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6160.0, + "completions/max_terminated_length": 6160.0, + "completions/mean_length": 3040.08349609375, + "completions/mean_terminated_length": 3040.08349609375, + "completions/min_length": 627.0, + "completions/min_terminated_length": 627.0, + "epoch": 0.21743554952510177, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.1536818891763687, + "kl": 0.0, + "learning_rate": 3.982056590752243e-07, + "loss": 0.0004, + "num_tokens": 22256750.0, + "reward": 1.2666666507720947, + "reward_std": 0.051639750599861145, + "rewards/correctness_reward_func/mean": 0.9666666984558105, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 641 + }, + { + "completion_length": 2255.2500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5187.0, + "completions/max_terminated_length": 5187.0, + "completions/mean_length": 2255.25, + "completions/mean_terminated_length": 2255.25, + "completions/min_length": 754.0, + "completions/min_terminated_length": 754.0, + "epoch": 0.21777476255088196, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8280144929885864, + "kl": 0.0, + "learning_rate": 3.980331262939958e-07, + "loss": 0.0191, + "num_tokens": 22293539.0, + "reward": 0.6333333253860474, + "reward_std": 0.1632993221282959, + "rewards/correctness_reward_func/mean": 0.3333333432674408, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 642 + }, + { + "completion_length": 1590.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5887.0, + "completions/max_terminated_length": 5887.0, + "completions/mean_length": 1590.8333740234375, + "completions/mean_terminated_length": 1590.8333740234375, + "completions/min_length": 726.0, + "completions/min_terminated_length": 726.0, + "epoch": 0.21811397557666215, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.10805870592594147, + "kl": 0.0, + "learning_rate": 3.978605935127674e-07, + "loss": 0.0101, + "num_tokens": 22323081.0, + "reward": 1.2833333015441895, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.9833333492279053, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 643 + }, + { + "completion_length": 2101.8334350585938, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6169.0, + "completions/mean_length": 3749.08349609375, + "completions/mean_terminated_length": 2802.444580078125, + "completions/min_length": 743.0, + "completions/min_terminated_length": 743.0, + "epoch": 0.21845318860244234, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9981639385223389, + "kl": NaN, + "learning_rate": 3.97688060731539e-07, + "loss": -0.0685, + "num_tokens": 22360183.0, + "reward": 0.7208334803581238, + "reward_std": 0.2441396415233612, + "rewards/correctness_reward_func/mean": 0.4833333194255829, + "rewards/correctness_reward_func/std": 0.43029236793518066, + "rewards/format_reward_func/mean": 0.23750001192092896, + "rewards/format_reward_func/std": 0.11894422769546509, + "step": 644 + }, + { + "completion_length": 1810.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4908.0, + "completions/max_terminated_length": 4908.0, + "completions/mean_length": 1810.5833740234375, + "completions/mean_terminated_length": 1810.5833740234375, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "epoch": 0.21879240162822253, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16437654197216034, + "kl": 0.0, + "learning_rate": 3.975155279503105e-07, + "loss": -0.006, + "num_tokens": 22390598.0, + "reward": 1.1666667461395264, + "reward_std": 0.09559705853462219, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 645 + }, + { + "completion_length": 1326.6667175292969, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 3824.0, + "completions/mean_length": 2424.83349609375, + "completions/mean_terminated_length": 1592.0, + "completions/min_length": 742.0, + "completions/min_terminated_length": 742.0, + "epoch": 0.21913161465400272, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.23768624663352966, + "kl": NaN, + "learning_rate": 3.973429951690821e-07, + "loss": -0.0106, + "num_tokens": 22418914.0, + "reward": 0.6500000953674316, + "reward_std": 0.0774596780538559, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.2500000298023224, + "rewards/format_reward_func/std": 0.11677484214305878, + "step": 646 + }, + { + "completion_length": 951.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3743.0, + "completions/max_terminated_length": 3743.0, + "completions/mean_length": 951.5, + "completions/mean_terminated_length": 951.5, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.2194708276797829, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10783212631940842, + "kl": 0.0, + "learning_rate": 3.9717046238785367e-07, + "loss": -0.0026, + "num_tokens": 22442500.0, + "reward": 1.254166603088379, + "reward_std": 0.08225837349891663, + "rewards/correctness_reward_func/mean": 0.9666666984558105, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 647 + }, + { + "completion_length": 1173.5000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2590.0, + "completions/max_terminated_length": 2590.0, + "completions/mean_length": 1173.5, + "completions/mean_terminated_length": 1173.5, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.2198100407055631, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.3344381954993878e-07, + "kl": 0.0, + "learning_rate": 3.969979296066253e-07, + "loss": 0.0, + "num_tokens": 22468684.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 648 + }, + { + "completion_length": 1362.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3917.0, + "completions/max_terminated_length": 3917.0, + "completions/mean_length": 1362.166748046875, + "completions/mean_terminated_length": 1362.166748046875, + "completions/min_length": 527.0, + "completions/min_terminated_length": 527.0, + "epoch": 0.22014925373134328, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.06996418535709381, + "kl": 0.0, + "learning_rate": 3.968253968253968e-07, + "loss": -0.0004, + "num_tokens": 22499058.0, + "reward": 1.1666667461395264, + "reward_std": 0.05163975805044174, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 649 + }, + { + "completion_length": 1067.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1865.0, + "completions/max_terminated_length": 1865.0, + "completions/mean_length": 1067.0, + "completions/mean_terminated_length": 1067.0, + "completions/min_length": 601.0, + "completions/min_terminated_length": 601.0, + "epoch": 0.22048846675712347, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.966528640441684e-07, + "loss": 0.0, + "num_tokens": 22521366.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 650 + }, + { + "completion_length": 1641.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4213.0, + "completions/max_terminated_length": 4213.0, + "completions/mean_length": 1641.8333740234375, + "completions/mean_terminated_length": 1641.8333740234375, + "completions/min_length": 992.0, + "completions/min_terminated_length": 992.0, + "epoch": 0.22082767978290366, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.08669435232877731, + "kl": 0.0, + "learning_rate": 3.9648033126293993e-07, + "loss": 0.0011, + "num_tokens": 22550260.0, + "reward": 1.25, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.949999988079071, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 651 + }, + { + "completion_length": 2429.0834350585938, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4880.0, + "completions/mean_length": 3527.25, + "completions/mean_terminated_length": 2914.900146484375, + "completions/min_length": 1257.0, + "completions/min_terminated_length": 1257.0, + "epoch": 0.22116689280868385, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5226991176605225, + "kl": NaN, + "learning_rate": 3.9630779848171154e-07, + "loss": -0.0914, + "num_tokens": 22593785.0, + "reward": 0.9666668176651001, + "reward_std": 0.32506412267684937, + "rewards/correctness_reward_func/mean": 0.7166666984558105, + "rewards/correctness_reward_func/std": 0.3459725081920624, + "rewards/format_reward_func/mean": 0.2500000298023224, + "rewards/format_reward_func/std": 0.11677484214305878, + "step": 652 + }, + { + "completion_length": 1602.3333740234375, + "completions/clipped_ratio": 0.33333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4316.0, + "completions/mean_length": 3798.666748046875, + "completions/mean_terminated_length": 2403.5, + "completions/min_length": 1219.0, + "completions/min_terminated_length": 1219.0, + "epoch": 0.22150610583446403, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.16389794647693634, + "kl": NaN, + "learning_rate": 3.9613526570048304e-07, + "loss": -0.019, + "num_tokens": 22626027.0, + "reward": 0.20000001788139343, + "reward_std": 0.0774596780538559, + "rewards/correctness_reward_func/mean": 0.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.20000000298023224, + "rewards/format_reward_func/std": 0.14770980179309845, + "step": 653 + }, + { + "completion_length": 970.8333435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2010.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 970.8333740234375, + "completions/mean_terminated_length": 970.8333740234375, + "completions/min_length": 497.0, + "completions/min_terminated_length": 497.0, + "epoch": 0.22184531886024422, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.08526045829057693, + "kl": 0.0, + "learning_rate": 3.9596273291925465e-07, + "loss": -0.0016, + "num_tokens": 22649347.0, + "reward": 1.2333333492279053, + "reward_std": 0.051639772951602936, + "rewards/correctness_reward_func/mean": 0.9333333373069763, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 654 + }, + { + "completion_length": 2165.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3378.0, + "completions/max_terminated_length": 3378.0, + "completions/mean_length": 2165.75, + "completions/mean_terminated_length": 2165.75, + "completions/min_length": 1147.0, + "completions/min_terminated_length": 1147.0, + "epoch": 0.2221845318860244, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.11644528061151505, + "kl": 0.0, + "learning_rate": 3.957902001380262e-07, + "loss": 0.0005, + "num_tokens": 22690942.0, + "reward": 1.2666666507720947, + "reward_std": 0.05163975805044174, + "rewards/correctness_reward_func/mean": 0.9666666984558105, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 655 + }, + { + "completion_length": 3226.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4712.0, + "completions/max_terminated_length": 4712.0, + "completions/mean_length": 3226.916748046875, + "completions/mean_terminated_length": 3226.916748046875, + "completions/min_length": 1360.0, + "completions/min_terminated_length": 1360.0, + "epoch": 0.2225237449118046, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.16651302576065063, + "kl": 0.0, + "learning_rate": 3.9561766735679775e-07, + "loss": -0.0028, + "num_tokens": 22740867.0, + "reward": 0.75, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.44999995827674866, + "rewards/correctness_reward_func/std": 0.47577688097953796, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 656 + }, + { + "completion_length": 2330.666748046875, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6434.0, + "completions/mean_length": 2879.75, + "completions/mean_terminated_length": 2542.54541015625, + "completions/min_length": 1151.0, + "completions/min_terminated_length": 1151.0, + "epoch": 0.22286295793758482, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2756352722644806, + "kl": NaN, + "learning_rate": 3.954451345755693e-07, + "loss": -0.0489, + "num_tokens": 22779851.0, + "reward": 1.0750000476837158, + "reward_std": 0.29088661074638367, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.2696799635887146, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660254627466202, + "step": 657 + }, + { + "completion_length": 678.5833435058594, + "completions/clipped_ratio": 0.41666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4787.0, + "completions/mean_length": 3424.0, + "completions/mean_terminated_length": 1163.2857666015625, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "epoch": 0.223202170963365, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3502046465873718, + "kl": NaN, + "learning_rate": 3.952726017943409e-07, + "loss": -0.0154, + "num_tokens": 22801434.0, + "reward": 0.6083333492279053, + "reward_std": 0.1128770112991333, + "rewards/correctness_reward_func/mean": 0.43333330750465393, + "rewards/correctness_reward_func/std": 0.45792683959007263, + "rewards/format_reward_func/mean": 0.17499999701976776, + "rewards/format_reward_func/std": 0.15447859466075897, + "step": 658 + }, + { + "completion_length": 1686.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5783.0, + "completions/max_terminated_length": 5783.0, + "completions/mean_length": 1686.75, + "completions/mean_terminated_length": 1686.75, + "completions/min_length": 722.0, + "completions/min_terminated_length": 722.0, + "epoch": 0.2235413839891452, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9250512719154358, + "kl": 0.0, + "learning_rate": 3.951000690131125e-07, + "loss": 0.0408, + "num_tokens": 22831161.0, + "reward": 1.0833333730697632, + "reward_std": 0.3129711151123047, + "rewards/correctness_reward_func/mean": 0.7833333015441895, + "rewards/correctness_reward_func/std": 0.37618499994277954, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 659 + }, + { + "completion_length": 2479.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5375.0, + "completions/max_terminated_length": 5375.0, + "completions/mean_length": 2479.5, + "completions/mean_terminated_length": 2479.5, + "completions/min_length": 796.0, + "completions/min_terminated_length": 796.0, + "epoch": 0.22388059701492538, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5752743482589722, + "kl": 0.0, + "learning_rate": 3.94927536231884e-07, + "loss": 0.0219, + "num_tokens": 22872837.0, + "reward": 1.149999976158142, + "reward_std": 0.17606817185878754, + "rewards/correctness_reward_func/mean": 0.8499999642372131, + "rewards/correctness_reward_func/std": 0.28444525599479675, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 660 + }, + { + "completion_length": 3482.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5768.0, + "completions/max_terminated_length": 5768.0, + "completions/mean_length": 3482.916748046875, + "completions/mean_terminated_length": 3482.916748046875, + "completions/min_length": 842.0, + "completions/min_terminated_length": 842.0, + "epoch": 0.22421981004070557, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8260320425033569, + "kl": 0.0, + "learning_rate": 3.947550034506556e-07, + "loss": -0.0385, + "num_tokens": 22924898.0, + "reward": 1.0166666507720947, + "reward_std": 0.24013885855674744, + "rewards/correctness_reward_func/mean": 0.7166666984558105, + "rewards/correctness_reward_func/std": 0.4386618733406067, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 661 + }, + { + "completion_length": 1033.9167175292969, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1684.0, + "completions/max_terminated_length": 1684.0, + "completions/mean_length": 1033.916748046875, + "completions/mean_terminated_length": 1033.916748046875, + "completions/min_length": 538.0, + "completions/min_terminated_length": 538.0, + "epoch": 0.22455902306648576, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.404193252325058, + "kl": 0.0, + "learning_rate": 3.945824706694272e-07, + "loss": -0.0029, + "num_tokens": 22953091.0, + "reward": 1.058333396911621, + "reward_std": 0.2610875070095062, + "rewards/correctness_reward_func/mean": 0.7833333015441895, + "rewards/correctness_reward_func/std": 0.37618499994277954, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 662 + }, + { + "completion_length": 1029.3333435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2507.0, + "completions/max_terminated_length": 2507.0, + "completions/mean_length": 1029.3333740234375, + "completions/mean_terminated_length": 1029.3333740234375, + "completions/min_length": 582.0, + "completions/min_terminated_length": 582.0, + "epoch": 0.22489823609226595, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0883605483513747e-07, + "kl": 0.0, + "learning_rate": 3.944099378881988e-07, + "loss": 0.0, + "num_tokens": 22976513.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 663 + }, + { + "completion_length": 670.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 888.0, + "completions/max_terminated_length": 888.0, + "completions/mean_length": 670.0, + "completions/mean_terminated_length": 670.0, + "completions/min_length": 438.0, + "completions/min_terminated_length": 438.0, + "epoch": 0.22523744911804613, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.06571773439645767, + "kl": 0.0, + "learning_rate": 3.942374051069703e-07, + "loss": -0.0005, + "num_tokens": 22999475.0, + "reward": 1.1166667938232422, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 664 + }, + { + "completion_length": 2176.8333740234375, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6299.0, + "completions/mean_length": 3275.0, + "completions/mean_terminated_length": 2612.199951171875, + "completions/min_length": 997.0, + "completions/min_terminated_length": 997.0, + "epoch": 0.22557666214382632, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5015944838523865, + "kl": NaN, + "learning_rate": 3.940648723257419e-07, + "loss": 0.0043, + "num_tokens": 23037429.0, + "reward": 0.8166667819023132, + "reward_std": 0.30441200733184814, + "rewards/correctness_reward_func/mean": 0.5666666626930237, + "rewards/correctness_reward_func/std": 0.42497774958610535, + "rewards/format_reward_func/mean": 0.2500000298023224, + "rewards/format_reward_func/std": 0.11677484214305878, + "step": 665 + }, + { + "completion_length": 2563.0001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5240.0, + "completions/max_terminated_length": 5240.0, + "completions/mean_length": 2563.0, + "completions/mean_terminated_length": 2563.0, + "completions/min_length": 923.0, + "completions/min_terminated_length": 923.0, + "epoch": 0.2259158751696065, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5822378396987915, + "kl": 0.0, + "learning_rate": 3.9389233954451344e-07, + "loss": 0.026, + "num_tokens": 23078463.0, + "reward": 1.066666603088379, + "reward_std": 0.20655912160873413, + "rewards/correctness_reward_func/mean": 0.7666666507720947, + "rewards/correctness_reward_func/std": 0.3700941801071167, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 666 + }, + { + "completion_length": 2461.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4158.0, + "completions/max_terminated_length": 4158.0, + "completions/mean_length": 2461.33349609375, + "completions/mean_terminated_length": 2461.33349609375, + "completions/min_length": 1184.0, + "completions/min_terminated_length": 1184.0, + "epoch": 0.2262550881953867, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.4431849137963582e-07, + "kl": 0.0, + "learning_rate": 3.9371980676328504e-07, + "loss": 0.0, + "num_tokens": 23119081.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 667 + }, + { + "completion_length": 1863.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4872.0, + "completions/max_terminated_length": 4872.0, + "completions/mean_length": 1863.3333740234375, + "completions/mean_terminated_length": 1863.3333740234375, + "completions/min_length": 512.0, + "completions/min_terminated_length": 512.0, + "epoch": 0.2265943012211669, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14633503556251526, + "kl": 0.0, + "learning_rate": 3.9354727398205654e-07, + "loss": 0.0029, + "num_tokens": 23151605.0, + "reward": 1.2166666984558105, + "reward_std": 0.09246456623077393, + "rewards/correctness_reward_func/mean": 0.9166666865348816, + "rewards/correctness_reward_func/std": 0.10298573225736618, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 668 + }, + { + "completion_length": 1864.8333740234375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4246.0, + "completions/mean_length": 2413.916748046875, + "completions/mean_terminated_length": 2034.3636474609375, + "completions/min_length": 911.0, + "completions/min_terminated_length": 911.0, + "epoch": 0.22693351424694708, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5761399269104004, + "kl": NaN, + "learning_rate": 3.9337474120082815e-07, + "loss": -0.0322, + "num_tokens": 23187561.0, + "reward": 0.8250000476837158, + "reward_std": 0.27409863471984863, + "rewards/correctness_reward_func/mean": 0.550000011920929, + "rewards/correctness_reward_func/std": 0.4100997745990753, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660254627466202, + "step": 669 + }, + { + "completion_length": 1130.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1708.0, + "completions/max_terminated_length": 1708.0, + "completions/mean_length": 1130.916748046875, + "completions/mean_terminated_length": 1130.916748046875, + "completions/min_length": 778.0, + "completions/min_terminated_length": 778.0, + "epoch": 0.22727272727272727, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.932022084195997e-07, + "loss": 0.0, + "num_tokens": 23216174.0, + "reward": 1.2999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 1.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 670 + }, + { + "completion_length": 1128.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2095.0, + "completions/max_terminated_length": 2095.0, + "completions/mean_length": 1128.416748046875, + "completions/mean_terminated_length": 1128.416748046875, + "completions/min_length": 642.0, + "completions/min_terminated_length": 642.0, + "epoch": 0.22761194029850745, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09337574988603592, + "kl": 0.0, + "learning_rate": 3.9302967563837126e-07, + "loss": 0.0024, + "num_tokens": 23244121.0, + "reward": 0.75, + "reward_std": 0.05477222427725792, + "rewards/correctness_reward_func/mean": 0.44999995827674866, + "rewards/correctness_reward_func/std": 0.47577688097953796, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 671 + }, + { + "completion_length": 1535.9166870117188, + "completions/clipped_ratio": 0.5, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5545.0, + "completions/mean_length": 4830.4169921875, + "completions/mean_terminated_length": 3071.83349609375, + "completions/min_length": 1840.0, + "completions/min_terminated_length": 1840.0, + "epoch": 0.22795115332428764, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6959670782089233, + "kl": NaN, + "learning_rate": 3.928571428571428e-07, + "loss": -0.0068, + "num_tokens": 23274990.0, + "reward": 0.28333336114883423, + "reward_std": 0.20655912160873413, + "rewards/correctness_reward_func/mean": 0.13333334028720856, + "rewards/correctness_reward_func/std": 0.31139957904815674, + "rewards/format_reward_func/mean": 0.15000000596046448, + "rewards/format_reward_func/std": 0.15666989982128143, + "step": 672 + }, + { + "completion_length": 1835.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3286.0, + "completions/max_terminated_length": 3286.0, + "completions/mean_length": 1835.8333740234375, + "completions/mean_terminated_length": 1835.8333740234375, + "completions/min_length": 767.0, + "completions/min_terminated_length": 767.0, + "epoch": 0.22829036635006783, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.10714925825595856, + "kl": 0.0, + "learning_rate": 3.926846100759144e-07, + "loss": 0.0022, + "num_tokens": 23310532.0, + "reward": 1.1666667461395264, + "reward_std": 0.051639750599861145, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 673 + }, + { + "completion_length": 1179.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2333.0, + "completions/max_terminated_length": 2333.0, + "completions/mean_length": 1179.166748046875, + "completions/mean_terminated_length": 1179.166748046875, + "completions/min_length": 505.0, + "completions/min_terminated_length": 505.0, + "epoch": 0.22862957937584802, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.31352046853317e-07, + "kl": 0.0, + "learning_rate": 3.92512077294686e-07, + "loss": 0.0, + "num_tokens": 23339454.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 674 + }, + { + "completion_length": 2697.08349609375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5319.0, + "completions/mean_length": 3246.166748046875, + "completions/mean_terminated_length": 2942.272705078125, + "completions/min_length": 974.0, + "completions/min_terminated_length": 974.0, + "epoch": 0.22896879240162823, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6637680530548096, + "kl": NaN, + "learning_rate": 3.923395445134575e-07, + "loss": -0.0302, + "num_tokens": 23380981.0, + "reward": 0.8583332896232605, + "reward_std": 0.22453653812408447, + "rewards/correctness_reward_func/mean": 0.5833333134651184, + "rewards/correctness_reward_func/std": 0.5149286389350891, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 675 + }, + { + "completion_length": 1928.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3160.0, + "completions/max_terminated_length": 3160.0, + "completions/mean_length": 1928.8333740234375, + "completions/mean_terminated_length": 1928.8333740234375, + "completions/min_length": 953.0, + "completions/min_terminated_length": 953.0, + "epoch": 0.22930800542740842, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.10171937942504883, + "kl": 0.0, + "learning_rate": 3.921670117322291e-07, + "loss": -0.0015, + "num_tokens": 23414105.0, + "reward": 1.1666667461395264, + "reward_std": 0.051639750599861145, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 676 + }, + { + "completion_length": 547.8333435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 844.0, + "completions/max_terminated_length": 844.0, + "completions/mean_length": 547.8333740234375, + "completions/mean_terminated_length": 547.8333740234375, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "epoch": 0.2296472184531886, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.059833355247974396, + "kl": 0.0, + "learning_rate": 3.919944789510007e-07, + "loss": 0.0003, + "num_tokens": 23431623.0, + "reward": 1.133333444595337, + "reward_std": 0.051639772951602936, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 677 + }, + { + "completion_length": 1936.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4324.0, + "completions/max_terminated_length": 4324.0, + "completions/mean_length": 1936.75, + "completions/mean_terminated_length": 1936.75, + "completions/min_length": 579.0, + "completions/min_terminated_length": 579.0, + "epoch": 0.2299864314789688, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4380330443382263, + "kl": 0.0, + "learning_rate": 3.918219461697723e-07, + "loss": -0.0158, + "num_tokens": 23468760.0, + "reward": 1.133333444595337, + "reward_std": 0.1632993221282959, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.28069180250167847, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 678 + }, + { + "completion_length": 910.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1362.0, + "completions/max_terminated_length": 1362.0, + "completions/mean_length": 910.4166870117188, + "completions/mean_terminated_length": 910.4166870117188, + "completions/min_length": 503.0, + "completions/min_terminated_length": 503.0, + "epoch": 0.230325644504749, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0412031770101748e-07, + "kl": 0.0, + "learning_rate": 3.916494133885438e-07, + "loss": 0.0, + "num_tokens": 23489765.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 679 + }, + { + "completion_length": 1419.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2880.0, + "completions/max_terminated_length": 2880.0, + "completions/mean_length": 1419.8333740234375, + "completions/mean_terminated_length": 1419.8333740234375, + "completions/min_length": 559.0, + "completions/min_terminated_length": 559.0, + "epoch": 0.23066485753052918, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.11209948360919952, + "kl": 0.0, + "learning_rate": 3.914768806073154e-07, + "loss": -0.0005, + "num_tokens": 23520645.0, + "reward": 1.1500000953674316, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.8500000834465027, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 680 + }, + { + "completion_length": 2968.7501220703125, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6533.0, + "completions/mean_length": 4066.916748046875, + "completions/mean_terminated_length": 3562.5, + "completions/min_length": 2128.0, + "completions/min_terminated_length": 2128.0, + "epoch": 0.23100407055630937, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.10447150468826294, + "kl": NaN, + "learning_rate": 3.9130434782608694e-07, + "loss": -0.0173, + "num_tokens": 23564442.0, + "reward": 0.6625000834465027, + "reward_std": 0.06274950504302979, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.09323723614215851, + "step": 681 + }, + { + "completion_length": 731.0833740234375, + "completions/clipped_ratio": 0.33333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 1945.0, + "completions/mean_length": 2927.416748046875, + "completions/mean_terminated_length": 1096.625, + "completions/min_length": 620.0, + "completions/min_terminated_length": 620.0, + "epoch": 0.23134328358208955, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.145811066031456, + "kl": NaN, + "learning_rate": 3.911318150448585e-07, + "loss": -0.0086, + "num_tokens": 23589685.0, + "reward": 0.699999988079071, + "reward_std": 0.0774596780538559, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.20000000298023224, + "rewards/format_reward_func/std": 0.14770980179309845, + "step": 682 + }, + { + "completion_length": 841.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1748.0, + "completions/max_terminated_length": 1748.0, + "completions/mean_length": 841.0, + "completions/mean_terminated_length": 841.0, + "completions/min_length": 505.0, + "completions/min_terminated_length": 505.0, + "epoch": 0.23168249660786974, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2751244008541107, + "kl": 0.0, + "learning_rate": 3.9095928226363005e-07, + "loss": -0.0016, + "num_tokens": 23610985.0, + "reward": 0.9666668176651001, + "reward_std": 0.20655910670757294, + "rewards/correctness_reward_func/mean": 0.6666666865348816, + "rewards/correctness_reward_func/std": 0.31139957904815674, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 683 + }, + { + "completion_length": 2837.666748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5740.0, + "completions/max_terminated_length": 5740.0, + "completions/mean_length": 2837.666748046875, + "completions/mean_terminated_length": 2837.666748046875, + "completions/min_length": 1397.0, + "completions/min_terminated_length": 1397.0, + "epoch": 0.23202170963364993, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0925978422164917, + "kl": 0.0, + "learning_rate": 3.9078674948240165e-07, + "loss": -0.0017, + "num_tokens": 23660457.0, + "reward": 1.2833333015441895, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.9833333492279053, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 684 + }, + { + "completion_length": 1592.0000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4128.0, + "completions/max_terminated_length": 4128.0, + "completions/mean_length": 1592.0, + "completions/mean_terminated_length": 1592.0, + "completions/min_length": 664.0, + "completions/min_terminated_length": 664.0, + "epoch": 0.23236092265943012, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6901833415031433, + "kl": 0.0, + "learning_rate": 3.906142167011732e-07, + "loss": -0.0094, + "num_tokens": 23688075.0, + "reward": 0.9666666388511658, + "reward_std": 0.2581988573074341, + "rewards/correctness_reward_func/mean": 0.6666666865348816, + "rewards/correctness_reward_func/std": 0.4923659861087799, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 685 + }, + { + "completion_length": 566.2500305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 919.0, + "completions/max_terminated_length": 919.0, + "completions/mean_length": 566.25, + "completions/mean_terminated_length": 566.25, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.2327001356852103, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23003122210502625, + "kl": 0.0, + "learning_rate": 3.9044168391994476e-07, + "loss": 0.0012, + "num_tokens": 23707470.0, + "reward": 1.1166666746139526, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func/mean": 0.8166666030883789, + "rewards/correctness_reward_func/std": 0.27579087018966675, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 686 + }, + { + "completion_length": 1287.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2399.0, + "completions/max_terminated_length": 2399.0, + "completions/mean_length": 1287.666748046875, + "completions/mean_terminated_length": 1287.666748046875, + "completions/min_length": 639.0, + "completions/min_terminated_length": 639.0, + "epoch": 0.2330393487109905, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09936317056417465, + "kl": 0.0, + "learning_rate": 3.902691511387163e-07, + "loss": 0.0, + "num_tokens": 23736716.0, + "reward": 1.1500000953674316, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.8500000834465027, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 687 + }, + { + "completion_length": 1156.2500305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4769.0, + "completions/max_terminated_length": 4769.0, + "completions/mean_length": 1156.25, + "completions/mean_terminated_length": 1156.25, + "completions/min_length": 576.0, + "completions/min_terminated_length": 576.0, + "epoch": 0.23337856173677068, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.900966183574879e-07, + "loss": 0.0, + "num_tokens": 23758805.0, + "reward": 0.30000001192092896, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 688 + }, + { + "completion_length": 1892.5000610351562, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6275.0, + "completions/mean_length": 2441.58349609375, + "completions/mean_terminated_length": 2064.54541015625, + "completions/min_length": 513.0, + "completions/min_terminated_length": 513.0, + "epoch": 0.23371777476255087, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6890511512756348, + "kl": NaN, + "learning_rate": 3.8992408557625947e-07, + "loss": -0.0588, + "num_tokens": 23796545.0, + "reward": 1.0375001430511475, + "reward_std": 0.2679903209209442, + "rewards/correctness_reward_func/mean": 0.75, + "rewards/correctness_reward_func/std": 0.36306774616241455, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 689 + }, + { + "completion_length": 2770.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4142.0, + "completions/max_terminated_length": 4142.0, + "completions/mean_length": 2770.33349609375, + "completions/mean_terminated_length": 2770.33349609375, + "completions/min_length": 1178.0, + "completions/min_terminated_length": 1178.0, + "epoch": 0.23405698778833106, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.734200656414032, + "kl": 0.0, + "learning_rate": 3.89751552795031e-07, + "loss": -0.0003, + "num_tokens": 23845119.0, + "reward": 1.0166666507720947, + "reward_std": 0.440767377614975, + "rewards/correctness_reward_func/mean": 0.7166666984558105, + "rewards/correctness_reward_func/std": 0.4386618733406067, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 690 + }, + { + "completion_length": 1363.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1962.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 1363.5833740234375, + "completions/mean_terminated_length": 1363.5833740234375, + "completions/min_length": 693.0, + "completions/min_terminated_length": 693.0, + "epoch": 0.23439620081411125, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.2079890982240613e-07, + "kl": 0.0, + "learning_rate": 3.8957902001380263e-07, + "loss": 0.0, + "num_tokens": 23873902.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 691 + }, + { + "completion_length": 822.8333435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1576.0, + "completions/max_terminated_length": 1576.0, + "completions/mean_length": 822.8333740234375, + "completions/mean_terminated_length": 822.8333740234375, + "completions/min_length": 438.0, + "completions/min_terminated_length": 438.0, + "epoch": 0.23473541383989144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10281941294670105, + "kl": 0.0, + "learning_rate": 3.894064872325742e-07, + "loss": -0.0002, + "num_tokens": 23895980.0, + "reward": 1.1500000953674316, + "reward_std": 0.09246459603309631, + "rewards/correctness_reward_func/mean": 0.8500000834465027, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 692 + }, + { + "completion_length": 699.5833435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1049.0, + "completions/max_terminated_length": 1049.0, + "completions/mean_length": 699.5833740234375, + "completions/mean_terminated_length": 699.5833740234375, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.23507462686567165, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.28774434328079224, + "kl": 0.0, + "learning_rate": 3.8923395445134574e-07, + "loss": -0.0007, + "num_tokens": 23916567.0, + "reward": 0.8833333849906921, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func/mean": 0.5833333134651184, + "rewards/correctness_reward_func/std": 0.5149286389350891, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 693 + }, + { + "completion_length": 1013.0000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2550.0, + "completions/max_terminated_length": 2550.0, + "completions/mean_length": 1013.0, + "completions/mean_terminated_length": 1013.0, + "completions/min_length": 566.0, + "completions/min_terminated_length": 566.0, + "epoch": 0.23541383989145184, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07994963973760605, + "kl": 0.0, + "learning_rate": 3.890614216701173e-07, + "loss": 0.0024, + "num_tokens": 23944047.0, + "reward": 1.2666666507720947, + "reward_std": 0.05163975805044174, + "rewards/correctness_reward_func/mean": 0.9666666984558105, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 694 + }, + { + "completion_length": 1904.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3562.0, + "completions/max_terminated_length": 3562.0, + "completions/mean_length": 1904.916748046875, + "completions/mean_terminated_length": 1904.916748046875, + "completions/min_length": 606.0, + "completions/min_terminated_length": 606.0, + "epoch": 0.23575305291723203, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09583219140768051, + "kl": 0.0, + "learning_rate": 3.888888888888889e-07, + "loss": -0.0006, + "num_tokens": 23977784.0, + "reward": 1.183333396911621, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 695 + }, + { + "completion_length": 876.8333435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1769.0, + "completions/max_terminated_length": 1769.0, + "completions/mean_length": 876.8333740234375, + "completions/mean_terminated_length": 876.8333740234375, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "epoch": 0.23609226594301222, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.380832300896145e-08, + "kl": 0.0, + "learning_rate": 3.8871635610766045e-07, + "loss": 0.0, + "num_tokens": 24001866.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 696 + }, + { + "completion_length": 1457.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3931.0, + "completions/max_terminated_length": 3931.0, + "completions/mean_length": 1457.5, + "completions/mean_terminated_length": 1457.5, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "epoch": 0.2364314789687924, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5751703381538391, + "kl": 0.0, + "learning_rate": 3.88543823326432e-07, + "loss": -0.022, + "num_tokens": 24030342.0, + "reward": 0.9000000953674316, + "reward_std": 0.21908903121948242, + "rewards/correctness_reward_func/mean": 0.6000000238418579, + "rewards/correctness_reward_func/std": 0.36181363463401794, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 697 + }, + { + "completion_length": 1276.666748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1866.0, + "completions/max_terminated_length": 1866.0, + "completions/mean_length": 1276.666748046875, + "completions/mean_terminated_length": 1276.666748046875, + "completions/min_length": 876.0, + "completions/min_terminated_length": 876.0, + "epoch": 0.2367706919945726, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5665131211280823, + "kl": 0.0, + "learning_rate": 3.8837129054520355e-07, + "loss": -0.0076, + "num_tokens": 24057146.0, + "reward": 1.066666841506958, + "reward_std": 0.3098386526107788, + "rewards/correctness_reward_func/mean": 0.7666666507720947, + "rewards/correctness_reward_func/std": 0.3700941801071167, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 698 + }, + { + "completion_length": 1514.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3280.0, + "completions/max_terminated_length": 3280.0, + "completions/mean_length": 1514.3333740234375, + "completions/mean_terminated_length": 1514.3333740234375, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.23710990502035278, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4253982901573181, + "kl": 0.0, + "learning_rate": 3.8819875776397516e-07, + "loss": 0.0088, + "num_tokens": 24085560.0, + "reward": 0.7000000476837158, + "reward_std": 0.3265986442565918, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 699 + }, + { + "completion_length": 1934.5833740234375, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5187.0, + "completions/mean_length": 3032.75, + "completions/mean_terminated_length": 2321.5, + "completions/min_length": 951.0, + "completions/min_terminated_length": 951.0, + "epoch": 0.23744911804613297, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4350591003894806, + "kl": NaN, + "learning_rate": 3.880262249827467e-07, + "loss": -0.0673, + "num_tokens": 24118171.0, + "reward": 0.9166667461395264, + "reward_std": 0.2840188145637512, + "rewards/correctness_reward_func/mean": 0.6666666865348816, + "rewards/correctness_reward_func/std": 0.31139957904815674, + "rewards/format_reward_func/mean": 0.25, + "rewards/format_reward_func/std": 0.11677484214305878, + "step": 700 + }, + { + "completion_length": 1958.0000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4626.0, + "completions/max_terminated_length": 4626.0, + "completions/mean_length": 1958.0, + "completions/mean_terminated_length": 1958.0, + "completions/min_length": 797.0, + "completions/min_terminated_length": 797.0, + "epoch": 0.23778833107191316, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35070163011550903, + "kl": 0.0, + "learning_rate": 3.8785369220151826e-07, + "loss": 0.0025, + "num_tokens": 24153667.0, + "reward": 1.1000001430511475, + "reward_std": 0.23490385711193085, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.2696799635887146, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 701 + }, + { + "completion_length": 1124.5000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2765.0, + "completions/max_terminated_length": 2765.0, + "completions/mean_length": 1124.5, + "completions/mean_terminated_length": 1124.5, + "completions/min_length": 400.0, + "completions/min_terminated_length": 400.0, + "epoch": 0.23812754409769335, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.11582597345113754, + "kl": 0.0, + "learning_rate": 3.876811594202898e-07, + "loss": -0.0012, + "num_tokens": 24179749.0, + "reward": 1.1500000953674316, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.8500000834465027, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 702 + }, + { + "completion_length": 1943.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4035.0, + "completions/max_terminated_length": 4035.0, + "completions/mean_length": 1943.25, + "completions/mean_terminated_length": 1943.25, + "completions/min_length": 519.0, + "completions/min_terminated_length": 519.0, + "epoch": 0.23846675712347354, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.11352552473545074, + "kl": 0.0, + "learning_rate": 3.875086266390614e-07, + "loss": 0.0012, + "num_tokens": 24213964.0, + "reward": 1.1500000953674316, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.8500000834465027, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 703 + }, + { + "completion_length": 1198.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2858.0, + "completions/max_terminated_length": 2858.0, + "completions/mean_length": 1198.25, + "completions/mean_terminated_length": 1198.25, + "completions/min_length": 496.0, + "completions/min_terminated_length": 496.0, + "epoch": 0.23880597014925373, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07122494280338287, + "kl": 0.0, + "learning_rate": 3.873360938578329e-07, + "loss": 0.001, + "num_tokens": 24244057.0, + "reward": 1.2166666984558105, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.9166666865348816, + "rewards/correctness_reward_func/std": 0.10298573225736618, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 704 + }, + { + "completion_length": 2193.666748046875, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 3976.0, + "completions/mean_length": 2742.75, + "completions/mean_terminated_length": 2393.091064453125, + "completions/min_length": 843.0, + "completions/min_terminated_length": 843.0, + "epoch": 0.23914518317503392, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09500417858362198, + "kl": NaN, + "learning_rate": 3.8716356107660453e-07, + "loss": -0.0105, + "num_tokens": 24282195.0, + "reward": 0.6750000715255737, + "reward_std": 0.06123724579811096, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 705 + }, + { + "completion_length": 2630.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5251.0, + "completions/max_terminated_length": 5251.0, + "completions/mean_length": 2630.58349609375, + "completions/mean_terminated_length": 2630.58349609375, + "completions/min_length": 829.0, + "completions/min_terminated_length": 829.0, + "epoch": 0.2394843962008141, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9753767848014832, + "kl": 0.0, + "learning_rate": 3.8699102829537613e-07, + "loss": -0.0168, + "num_tokens": 24327838.0, + "reward": 1.1000001430511475, + "reward_std": 0.3098386526107788, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.38138505816459656, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 706 + }, + { + "completion_length": 1942.7500610351562, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4530.0, + "completions/mean_length": 2491.83349609375, + "completions/mean_terminated_length": 2119.36376953125, + "completions/min_length": 513.0, + "completions/min_terminated_length": 513.0, + "epoch": 0.2398236092265943, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2561108469963074, + "kl": NaN, + "learning_rate": 3.868184955141477e-07, + "loss": -0.0457, + "num_tokens": 24366193.0, + "reward": 1.0416667461395264, + "reward_std": 0.2761763334274292, + "rewards/correctness_reward_func/mean": 0.7666667103767395, + "rewards/correctness_reward_func/std": 0.2534608840942383, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660254627466202, + "step": 707 + }, + { + "completion_length": 1299.0000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2080.0, + "completions/max_terminated_length": 2080.0, + "completions/mean_length": 1299.0, + "completions/mean_terminated_length": 1299.0, + "completions/min_length": 559.0, + "completions/min_terminated_length": 559.0, + "epoch": 0.24016282225237448, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.11090793460607529, + "kl": 0.0, + "learning_rate": 3.8664596273291924e-07, + "loss": -0.0001, + "num_tokens": 24393121.0, + "reward": 0.7666666507720947, + "reward_std": 0.05163975805044174, + "rewards/correctness_reward_func/mean": 0.46666666865348816, + "rewards/correctness_reward_func/std": 0.4923659563064575, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 708 + }, + { + "completion_length": 2367.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5939.0, + "completions/max_terminated_length": 5939.0, + "completions/mean_length": 2367.75, + "completions/mean_terminated_length": 2367.75, + "completions/min_length": 1082.0, + "completions/min_terminated_length": 1082.0, + "epoch": 0.24050203527815467, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.2568166830060363e-07, + "kl": 0.0, + "learning_rate": 3.864734299516908e-07, + "loss": 0.0, + "num_tokens": 24432862.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 709 + }, + { + "completion_length": 1470.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2653.0, + "completions/max_terminated_length": 2653.0, + "completions/mean_length": 1470.666748046875, + "completions/mean_terminated_length": 1470.666748046875, + "completions/min_length": 611.0, + "completions/min_terminated_length": 611.0, + "epoch": 0.24084124830393486, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.8426899234546e-07, + "kl": 0.0, + "learning_rate": 3.863008971704624e-07, + "loss": 0.0, + "num_tokens": 24462162.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 710 + }, + { + "completion_length": 2114.2501220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6433.0, + "completions/max_terminated_length": 6433.0, + "completions/mean_length": 2114.25, + "completions/mean_terminated_length": 2114.25, + "completions/min_length": 487.0, + "completions/min_terminated_length": 487.0, + "epoch": 0.24118046132971507, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.05685143172740936, + "kl": 0.0, + "learning_rate": 3.8612836438923395e-07, + "loss": 0.0, + "num_tokens": 24495963.0, + "reward": 1.1166667938232422, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 711 + }, + { + "completion_length": 929.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1821.0, + "completions/max_terminated_length": 1821.0, + "completions/mean_length": 929.1666870117188, + "completions/mean_terminated_length": 929.1666870117188, + "completions/min_length": 474.0, + "completions/min_terminated_length": 474.0, + "epoch": 0.24151967435549526, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.06048990786075592, + "kl": 0.0, + "learning_rate": 3.859558316080055e-07, + "loss": -0.0, + "num_tokens": 24517901.0, + "reward": 1.183333396911621, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 712 + }, + { + "completion_length": 1756.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3761.0, + "completions/max_terminated_length": 3761.0, + "completions/mean_length": 1756.3333740234375, + "completions/mean_terminated_length": 1756.3333740234375, + "completions/min_length": 840.0, + "completions/min_terminated_length": 840.0, + "epoch": 0.24185888738127545, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.14410604536533356, + "kl": 0.0, + "learning_rate": 3.8578329882677706e-07, + "loss": 0.0002, + "num_tokens": 24553629.0, + "reward": 1.1541666984558105, + "reward_std": 0.07486096024513245, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 713 + }, + { + "completion_length": 2858.5001220703125, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5566.0, + "completions/mean_length": 3407.58349609375, + "completions/mean_terminated_length": 3118.36376953125, + "completions/min_length": 1465.0, + "completions/min_terminated_length": 1465.0, + "epoch": 0.24219810040705564, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2787613272666931, + "kl": NaN, + "learning_rate": 3.8561076604554866e-07, + "loss": -0.0455, + "num_tokens": 24603819.0, + "reward": 1.0916666984558105, + "reward_std": 0.26536136865615845, + "rewards/correctness_reward_func/mean": 0.8166666030883789, + "rewards/correctness_reward_func/std": 0.27579087018966675, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 714 + }, + { + "completion_length": 2805.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6465.0, + "completions/max_terminated_length": 6465.0, + "completions/mean_length": 2805.5, + "completions/mean_terminated_length": 2805.5, + "completions/min_length": 967.0, + "completions/min_terminated_length": 967.0, + "epoch": 0.24253731343283583, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.48999252915382385, + "kl": 0.0, + "learning_rate": 3.8543823326432016e-07, + "loss": 0.0095, + "num_tokens": 24648591.0, + "reward": 0.38333332538604736, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func/mean": 0.0833333358168602, + "rewards/correctness_reward_func/std": 0.28867512941360474, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 715 + }, + { + "completion_length": 2720.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5956.0, + "completions/max_terminated_length": 5956.0, + "completions/mean_length": 2720.166748046875, + "completions/mean_terminated_length": 2720.166748046875, + "completions/min_length": 774.0, + "completions/min_terminated_length": 774.0, + "epoch": 0.24287652645861602, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.42055872082710266, + "kl": 0.0, + "learning_rate": 3.8526570048309177e-07, + "loss": -0.0159, + "num_tokens": 24692747.0, + "reward": 0.6333333253860474, + "reward_std": 0.1632993221282959, + "rewards/correctness_reward_func/mean": 0.3333333432674408, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 716 + }, + { + "completion_length": 1547.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2570.0, + "completions/max_terminated_length": 2570.0, + "completions/mean_length": 1547.416748046875, + "completions/mean_terminated_length": 1547.416748046875, + "completions/min_length": 846.0, + "completions/min_terminated_length": 846.0, + "epoch": 0.2432157394843962, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07790729403495789, + "kl": 0.0, + "learning_rate": 3.850931677018633e-07, + "loss": -0.0014, + "num_tokens": 24721240.0, + "reward": 1.1666667461395264, + "reward_std": 0.051639750599861145, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 717 + }, + { + "completion_length": 1684.5000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4692.0, + "completions/max_terminated_length": 4692.0, + "completions/mean_length": 1684.5, + "completions/mean_terminated_length": 1684.5, + "completions/min_length": 559.0, + "completions/min_terminated_length": 559.0, + "epoch": 0.2435549525101764, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09478975832462311, + "kl": 0.0, + "learning_rate": 3.8492063492063493e-07, + "loss": -0.0023, + "num_tokens": 24754684.0, + "reward": 1.133333444595337, + "reward_std": 0.05163976177573204, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 718 + }, + { + "completion_length": 3755.0001220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6407.0, + "completions/max_terminated_length": 6407.0, + "completions/mean_length": 3755.0, + "completions/mean_terminated_length": 3755.0, + "completions/min_length": 2256.0, + "completions/min_terminated_length": 2256.0, + "epoch": 0.24389416553595658, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.8474810213940643e-07, + "loss": 0.0, + "num_tokens": 24810616.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 719 + }, + { + "completion_length": 1573.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2848.0, + "completions/max_terminated_length": 2848.0, + "completions/mean_length": 1573.3333740234375, + "completions/mean_terminated_length": 1573.3333740234375, + "completions/min_length": 851.0, + "completions/min_terminated_length": 851.0, + "epoch": 0.24423337856173677, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.08259549736976624, + "kl": 0.0, + "learning_rate": 3.8457556935817803e-07, + "loss": -0.0009, + "num_tokens": 24840470.0, + "reward": 1.183333396911621, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 720 + }, + { + "completion_length": 1527.416748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4029.0, + "completions/max_terminated_length": 4029.0, + "completions/mean_length": 1527.416748046875, + "completions/mean_terminated_length": 1527.416748046875, + "completions/min_length": 735.0, + "completions/min_terminated_length": 735.0, + "epoch": 0.24457259158751696, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.507327139377594, + "kl": 0.0, + "learning_rate": 3.8440303657694964e-07, + "loss": -0.022, + "num_tokens": 24872401.0, + "reward": 1.0, + "reward_std": 0.21908903121948242, + "rewards/correctness_reward_func/mean": 0.699999988079071, + "rewards/correctness_reward_func/std": 0.43064433336257935, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 721 + }, + { + "completion_length": 2963.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4433.0, + "completions/max_terminated_length": 4433.0, + "completions/mean_length": 2963.916748046875, + "completions/mean_terminated_length": 2963.916748046875, + "completions/min_length": 1830.0, + "completions/min_terminated_length": 1830.0, + "epoch": 0.24491180461329715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5084398984909058, + "kl": 0.0, + "learning_rate": 3.842305037957212e-07, + "loss": 0.0097, + "num_tokens": 24920676.0, + "reward": 1.149999976158142, + "reward_std": 0.2270146608352661, + "rewards/correctness_reward_func/mean": 0.8499999642372131, + "rewards/correctness_reward_func/std": 0.28444522619247437, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 722 + }, + { + "completion_length": 2600.0833740234375, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5634.0, + "completions/mean_length": 4247.33349609375, + "completions/mean_terminated_length": 3466.77783203125, + "completions/min_length": 919.0, + "completions/min_terminated_length": 919.0, + "epoch": 0.24525101763907733, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0371900796890259, + "kl": NaN, + "learning_rate": 3.8405797101449274e-07, + "loss": -0.0943, + "num_tokens": 24960865.0, + "reward": 0.7083333730697632, + "reward_std": 0.4670211672782898, + "rewards/correctness_reward_func/mean": 0.4833333492279053, + "rewards/correctness_reward_func/std": 0.5078176856040955, + "rewards/format_reward_func/mean": 0.22500000894069672, + "rewards/format_reward_func/std": 0.13568010926246643, + "step": 723 + }, + { + "completion_length": 526.3333435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1250.0, + "completions/max_terminated_length": 1250.0, + "completions/mean_length": 526.3333740234375, + "completions/mean_terminated_length": 526.3333740234375, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.24559023066485752, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.004301494362153e-08, + "kl": 0.0, + "learning_rate": 3.838854382332643e-07, + "loss": 0.0, + "num_tokens": 24981371.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 724 + }, + { + "completion_length": 1995.166748046875, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4928.0, + "completions/mean_length": 2544.25, + "completions/mean_terminated_length": 2176.54541015625, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.2459294436906377, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07325784862041473, + "kl": NaN, + "learning_rate": 3.837129054520359e-07, + "loss": -0.0124, + "num_tokens": 25017247.0, + "reward": 0.6750000715255737, + "reward_std": 0.06123724579811096, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 725 + }, + { + "completion_length": 1874.3334350585938, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4024.0, + "completions/max_terminated_length": 4024.0, + "completions/mean_length": 1874.3333740234375, + "completions/mean_terminated_length": 1874.3333740234375, + "completions/min_length": 585.0, + "completions/min_terminated_length": 585.0, + "epoch": 0.2462686567164179, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.9815544760604098e-07, + "kl": 0.0, + "learning_rate": 3.835403726708074e-07, + "loss": 0.0, + "num_tokens": 25055375.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 726 + }, + { + "completion_length": 1643.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5250.0, + "completions/max_terminated_length": 5250.0, + "completions/mean_length": 1643.5833740234375, + "completions/mean_terminated_length": 1643.5833740234375, + "completions/min_length": 555.0, + "completions/min_terminated_length": 555.0, + "epoch": 0.2466078697421981, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.3413605276800808e-07, + "kl": 0.0, + "learning_rate": 3.83367839889579e-07, + "loss": 0.0, + "num_tokens": 25086828.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 727 + }, + { + "completion_length": 2202.2500610351562, + "completions/clipped_ratio": 0.5833333333333333, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6381.0, + "completions/mean_length": 6045.83349609375, + "completions/mean_terminated_length": 5285.39990234375, + "completions/min_length": 3332.0, + "completions/min_terminated_length": 3332.0, + "epoch": 0.24694708276797828, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7284405827522278, + "kl": NaN, + "learning_rate": 3.8319530710835056e-07, + "loss": -0.1411, + "num_tokens": 25121163.0, + "reward": 0.33750003576278687, + "reward_std": 0.37498682737350464, + "rewards/correctness_reward_func/mean": 0.20000000298023224, + "rewards/correctness_reward_func/std": 0.36181363463401794, + "rewards/format_reward_func/mean": 0.13750000298023224, + "rewards/format_reward_func/std": 0.14943073689937592, + "step": 728 + }, + { + "completion_length": 1111.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1887.0, + "completions/max_terminated_length": 1887.0, + "completions/mean_length": 1111.3333740234375, + "completions/mean_terminated_length": 1111.3333740234375, + "completions/min_length": 764.0, + "completions/min_terminated_length": 764.0, + "epoch": 0.2472862957937585, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.3690761591078626e-07, + "kl": 0.0, + "learning_rate": 3.8302277432712217e-07, + "loss": 0.0, + "num_tokens": 25146583.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 729 + }, + { + "completion_length": 3647.250244140625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5731.0, + "completions/max_terminated_length": 5731.0, + "completions/mean_length": 3647.25, + "completions/mean_terminated_length": 3647.25, + "completions/min_length": 1535.0, + "completions/min_terminated_length": 1535.0, + "epoch": 0.24762550881953868, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6610546708106995, + "kl": 0.0, + "learning_rate": 3.8285024154589367e-07, + "loss": -0.0118, + "num_tokens": 25205158.0, + "reward": 0.7833334803581238, + "reward_std": 0.20412415266036987, + "rewards/correctness_reward_func/mean": 0.4833333492279053, + "rewards/correctness_reward_func/std": 0.43029239773750305, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 730 + }, + { + "completion_length": 2499.666748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6543.0, + "completions/max_terminated_length": 6543.0, + "completions/mean_length": 2499.666748046875, + "completions/mean_terminated_length": 2499.666748046875, + "completions/min_length": 608.0, + "completions/min_terminated_length": 608.0, + "epoch": 0.24796472184531887, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07455330342054367, + "kl": 0.0, + "learning_rate": 3.8267770876466527e-07, + "loss": -0.0011, + "num_tokens": 25245858.0, + "reward": 0.7666666507720947, + "reward_std": 0.051639750599861145, + "rewards/correctness_reward_func/mean": 0.46666666865348816, + "rewards/correctness_reward_func/std": 0.4923659563064575, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 731 + }, + { + "completion_length": 1155.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2734.0, + "completions/max_terminated_length": 2734.0, + "completions/mean_length": 1155.25, + "completions/mean_terminated_length": 1155.25, + "completions/min_length": 532.0, + "completions/min_terminated_length": 532.0, + "epoch": 0.24830393487109906, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.08911774307489395, + "kl": 0.0, + "learning_rate": 3.825051759834368e-07, + "loss": 0.0029, + "num_tokens": 25270719.0, + "reward": 0.7833333015441895, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.4833333492279053, + "rewards/correctness_reward_func/std": 0.5078176856040955, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 732 + }, + { + "completion_length": 1258.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1898.0, + "completions/max_terminated_length": 1898.0, + "completions/mean_length": 1258.8333740234375, + "completions/mean_terminated_length": 1258.8333740234375, + "completions/min_length": 658.0, + "completions/min_terminated_length": 658.0, + "epoch": 0.24864314789687925, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.05099448561668396, + "kl": 0.0, + "learning_rate": 3.8233264320220843e-07, + "loss": -0.0012, + "num_tokens": 25292191.0, + "reward": 1.2833333015441895, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.9833333492279053, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 733 + }, + { + "completion_length": 2122.7501220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2994.0, + "completions/max_terminated_length": 2994.0, + "completions/mean_length": 2122.75, + "completions/mean_terminated_length": 2122.75, + "completions/min_length": 971.0, + "completions/min_terminated_length": 971.0, + "epoch": 0.24898236092265943, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.335657000541687, + "kl": 0.0, + "learning_rate": 3.8216011042097993e-07, + "loss": 0.0078, + "num_tokens": 25329844.0, + "reward": 0.6333333253860474, + "reward_std": 0.1632993221282959, + "rewards/correctness_reward_func/mean": 0.3333333432674408, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 734 + }, + { + "completion_length": 2080.7501220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5717.0, + "completions/max_terminated_length": 5717.0, + "completions/mean_length": 2080.75, + "completions/mean_terminated_length": 2080.75, + "completions/min_length": 529.0, + "completions/min_terminated_length": 529.0, + "epoch": 0.24932157394843962, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.0757533409087046e-07, + "kl": 0.0, + "learning_rate": 3.8198757763975154e-07, + "loss": 0.0, + "num_tokens": 25368103.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 735 + }, + { + "completion_length": 2086.5001220703125, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5375.0, + "completions/mean_length": 2635.58349609375, + "completions/mean_terminated_length": 2276.181884765625, + "completions/min_length": 965.0, + "completions/min_terminated_length": 965.0, + "epoch": 0.2496607869742198, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5441375374794006, + "kl": NaN, + "learning_rate": 3.818150448585231e-07, + "loss": 0.0135, + "num_tokens": 25404553.0, + "reward": 0.5416666865348816, + "reward_std": 0.2677963674068451, + "rewards/correctness_reward_func/mean": 0.2666666805744171, + "rewards/correctness_reward_func/std": 0.393892765045166, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 736 + }, + { + "completion_length": 1710.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3985.0, + "completions/max_terminated_length": 3985.0, + "completions/mean_length": 1710.916748046875, + "completions/mean_terminated_length": 1710.916748046875, + "completions/min_length": 476.0, + "completions/min_terminated_length": 476.0, + "epoch": 0.25, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.43407583236694336, + "kl": 0.0, + "learning_rate": 3.8164251207729464e-07, + "loss": 0.0034, + "num_tokens": 25435602.0, + "reward": 0.6333333253860474, + "reward_std": 0.1632993221282959, + "rewards/correctness_reward_func/mean": 0.3333333432674408, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 737 + }, + { + "completion_length": 1790.5001220703125, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4435.0, + "completions/mean_length": 2339.58349609375, + "completions/mean_terminated_length": 1953.2728271484375, + "completions/min_length": 725.0, + "completions/min_terminated_length": 725.0, + "epoch": 0.2503392130257802, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07025188952684402, + "kl": NaN, + "learning_rate": 3.8146997929606625e-07, + "loss": -0.0104, + "num_tokens": 25472916.0, + "reward": 0.26250001788139343, + "reward_std": 0.09185586869716644, + "rewards/correctness_reward_func/mean": 0.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.09323723614215851, + "step": 738 + }, + { + "completion_length": 1810.4166870117188, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5111.0, + "completions/mean_length": 2908.58349609375, + "completions/mean_terminated_length": 2172.5, + "completions/min_length": 804.0, + "completions/min_terminated_length": 804.0, + "epoch": 0.2506784260515604, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8079109191894531, + "kl": NaN, + "learning_rate": 3.812974465148378e-07, + "loss": -0.0606, + "num_tokens": 25503497.0, + "reward": 0.5166666507720947, + "reward_std": 0.2840188145637512, + "rewards/correctness_reward_func/mean": 0.2666666805744171, + "rewards/correctness_reward_func/std": 0.393892765045166, + "rewards/format_reward_func/mean": 0.25, + "rewards/format_reward_func/std": 0.11677484214305878, + "step": 739 + }, + { + "completion_length": 861.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1548.0, + "completions/max_terminated_length": 1548.0, + "completions/mean_length": 861.6666870117188, + "completions/mean_terminated_length": 861.6666870117188, + "completions/min_length": 534.0, + "completions/min_terminated_length": 534.0, + "epoch": 0.2510176390773406, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.0568649006236228e-07, + "kl": 0.0, + "learning_rate": 3.811249137336094e-07, + "loss": 0.0, + "num_tokens": 25525681.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 740 + }, + { + "completion_length": 2058.916748046875, + "completions/clipped_ratio": 0.33333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6243.0, + "completions/mean_length": 4255.25, + "completions/mean_terminated_length": 3088.375, + "completions/min_length": 1242.0, + "completions/min_terminated_length": 1242.0, + "epoch": 0.25135685210312075, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2331775426864624, + "kl": NaN, + "learning_rate": 3.809523809523809e-07, + "loss": -0.1065, + "num_tokens": 25565538.0, + "reward": 0.6666666865348816, + "reward_std": 0.4954916834831238, + "rewards/correctness_reward_func/mean": 0.46666669845581055, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.20000000298023224, + "rewards/format_reward_func/std": 0.14770980179309845, + "step": 741 + }, + { + "completion_length": 1583.0000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2655.0, + "completions/max_terminated_length": 2655.0, + "completions/mean_length": 1583.0, + "completions/mean_terminated_length": 1583.0, + "completions/min_length": 746.0, + "completions/min_terminated_length": 746.0, + "epoch": 0.25169606512890097, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.3111180067062378, + "kl": 0.0, + "learning_rate": 3.807798481711525e-07, + "loss": -0.0014, + "num_tokens": 25600728.0, + "reward": 1.133333444595337, + "reward_std": 0.1632993221282959, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.2806917726993561, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 742 + }, + { + "completion_length": 2449.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4081.0, + "completions/max_terminated_length": 4081.0, + "completions/mean_length": 2449.83349609375, + "completions/mean_terminated_length": 2449.83349609375, + "completions/min_length": 670.0, + "completions/min_terminated_length": 670.0, + "epoch": 0.25203527815468113, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.9365247655732674e-07, + "kl": 0.0, + "learning_rate": 3.8060731538992407e-07, + "loss": 0.0, + "num_tokens": 25642138.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 743 + }, + { + "completion_length": 1862.6666870117188, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5477.0, + "completions/mean_length": 2411.75, + "completions/mean_terminated_length": 2032.0, + "completions/min_length": 1016.0, + "completions/min_terminated_length": 1016.0, + "epoch": 0.25237449118046135, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5083035826683044, + "kl": NaN, + "learning_rate": 3.8043478260869567e-07, + "loss": -0.0015, + "num_tokens": 25674756.0, + "reward": 0.9416667819023132, + "reward_std": 0.24983328580856323, + "rewards/correctness_reward_func/mean": 0.6666666865348816, + "rewards/correctness_reward_func/std": 0.31139957904815674, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660254627466202, + "step": 744 + }, + { + "completion_length": 2217.5001220703125, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5314.0, + "completions/mean_length": 2766.58349609375, + "completions/mean_terminated_length": 2419.091064453125, + "completions/min_length": 660.0, + "completions/min_terminated_length": 660.0, + "epoch": 0.2527137042062415, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5263506174087524, + "kl": NaN, + "learning_rate": 3.8026224982746717e-07, + "loss": -0.0127, + "num_tokens": 25710984.0, + "reward": 0.8875001668930054, + "reward_std": 0.23438750207424164, + "rewards/correctness_reward_func/mean": 0.5999999642372131, + "rewards/correctness_reward_func/std": 0.36181363463401794, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 745 + }, + { + "completion_length": 2103.3334350585938, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4934.0, + "completions/mean_length": 2652.416748046875, + "completions/mean_terminated_length": 2294.54541015625, + "completions/min_length": 779.0, + "completions/min_terminated_length": 779.0, + "epoch": 0.2530529172320217, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.532516360282898, + "kl": NaN, + "learning_rate": 3.800897170462388e-07, + "loss": -0.0576, + "num_tokens": 25749766.0, + "reward": 0.8083333969116211, + "reward_std": 0.23327383399009705, + "rewards/correctness_reward_func/mean": 0.5333333611488342, + "rewards/correctness_reward_func/std": 0.3938928246498108, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660254627466202, + "step": 746 + }, + { + "completion_length": 2240.8333740234375, + "completions/clipped_ratio": 0.33333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6402.0, + "completions/mean_length": 4437.1669921875, + "completions/mean_terminated_length": 3361.25, + "completions/min_length": 1404.0, + "completions/min_terminated_length": 1404.0, + "epoch": 0.2533921302578019, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7276754379272461, + "kl": NaN, + "learning_rate": 3.7991718426501033e-07, + "loss": -0.0838, + "num_tokens": 25790936.0, + "reward": 0.6666666865348816, + "reward_std": 0.4954916834831238, + "rewards/correctness_reward_func/mean": 0.46666669845581055, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.20000000298023224, + "rewards/format_reward_func/std": 0.14770980179309845, + "step": 747 + }, + { + "completion_length": 935.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1434.0, + "completions/max_terminated_length": 1434.0, + "completions/mean_length": 935.75, + "completions/mean_terminated_length": 935.75, + "completions/min_length": 518.0, + "completions/min_terminated_length": 518.0, + "epoch": 0.2537313432835821, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.218993568538281e-08, + "kl": 0.0, + "learning_rate": 3.797446514837819e-07, + "loss": 0.0, + "num_tokens": 25813547.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 748 + }, + { + "completion_length": 1297.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3254.0, + "completions/max_terminated_length": 3254.0, + "completions/mean_length": 1297.8333740234375, + "completions/mean_terminated_length": 1297.8333740234375, + "completions/min_length": 514.0, + "completions/min_terminated_length": 514.0, + "epoch": 0.25407055630936226, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.04760192334651947, + "kl": 0.0, + "learning_rate": 3.7957211870255344e-07, + "loss": 0.0017, + "num_tokens": 25845291.0, + "reward": 1.1166667938232422, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 749 + }, + { + "completion_length": 3342.08349609375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5745.0, + "completions/max_terminated_length": 5745.0, + "completions/mean_length": 3342.08349609375, + "completions/mean_terminated_length": 3342.08349609375, + "completions/min_length": 986.0, + "completions/min_terminated_length": 986.0, + "epoch": 0.2544097693351425, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.1283939778804779, + "kl": 0.0, + "learning_rate": 3.7939958592132504e-07, + "loss": 0.0042, + "num_tokens": 25900642.0, + "reward": 0.7833333015441895, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.4833333492279053, + "rewards/correctness_reward_func/std": 0.5078176856040955, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 750 + }, + { + "completion_length": 921.5000305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1709.0, + "completions/max_terminated_length": 1709.0, + "completions/mean_length": 921.5, + "completions/mean_terminated_length": 921.5, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.25474898236092264, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.792270531400966e-07, + "loss": 0.0, + "num_tokens": 25921246.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 751 + }, + { + "completion_length": 2476.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3929.0, + "completions/max_terminated_length": 3929.0, + "completions/mean_length": 2476.5, + "completions/mean_terminated_length": 2476.5, + "completions/min_length": 1234.0, + "completions/min_terminated_length": 1234.0, + "epoch": 0.25508819538670285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14889118075370789, + "kl": 0.0, + "learning_rate": 3.7905452035886815e-07, + "loss": -0.0032, + "num_tokens": 25963354.0, + "reward": 1.2333333492279053, + "reward_std": 0.10327950119972229, + "rewards/correctness_reward_func/mean": 0.9333333373069763, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 752 + }, + { + "completion_length": 1880.3333740234375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4136.0, + "completions/mean_length": 2429.416748046875, + "completions/mean_terminated_length": 2051.272705078125, + "completions/min_length": 939.0, + "completions/min_terminated_length": 939.0, + "epoch": 0.255427408412483, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7829989194869995, + "kl": NaN, + "learning_rate": 3.7888198757763975e-07, + "loss": -0.0045, + "num_tokens": 26000126.0, + "reward": 0.9750000834465027, + "reward_std": 0.36095842719078064, + "rewards/correctness_reward_func/mean": 0.7000000476837158, + "rewards/correctness_reward_func/std": 0.43064433336257935, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660254627466202, + "step": 753 + }, + { + "completion_length": 1247.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4548.0, + "completions/max_terminated_length": 4548.0, + "completions/mean_length": 1247.416748046875, + "completions/mean_terminated_length": 1247.416748046875, + "completions/min_length": 488.0, + "completions/min_terminated_length": 488.0, + "epoch": 0.25576662143826323, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.129471994294363e-07, + "kl": 0.0, + "learning_rate": 3.787094547964113e-07, + "loss": 0.0, + "num_tokens": 26026249.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 754 + }, + { + "completion_length": 1251.9166870117188, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4579.0, + "completions/mean_length": 2350.08349609375, + "completions/mean_terminated_length": 1502.300048828125, + "completions/min_length": 654.0, + "completions/min_terminated_length": 654.0, + "epoch": 0.25610583446404345, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.3859163224697113, + "kl": NaN, + "learning_rate": 3.785369220151829e-07, + "loss": -0.0551, + "num_tokens": 26054394.0, + "reward": 1.0833332538604736, + "reward_std": 0.3356585204601288, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.3892494738101959, + "rewards/format_reward_func/mean": 0.25, + "rewards/format_reward_func/std": 0.11677484214305878, + "step": 755 + }, + { + "completion_length": 1971.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5072.0, + "completions/max_terminated_length": 5072.0, + "completions/mean_length": 1971.916748046875, + "completions/mean_terminated_length": 1971.916748046875, + "completions/min_length": 738.0, + "completions/min_terminated_length": 738.0, + "epoch": 0.2564450474898236, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.13238492608070374, + "kl": 0.0, + "learning_rate": 3.783643892339544e-07, + "loss": -0.0047, + "num_tokens": 26091593.0, + "reward": 0.7333333492279053, + "reward_std": 0.051639772951602936, + "rewards/correctness_reward_func/mean": 0.43333330750465393, + "rewards/correctness_reward_func/std": 0.45792683959007263, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 756 + }, + { + "completion_length": 1132.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2217.0, + "completions/max_terminated_length": 2217.0, + "completions/mean_length": 1132.166748046875, + "completions/mean_terminated_length": 1132.166748046875, + "completions/min_length": 467.0, + "completions/min_terminated_length": 467.0, + "epoch": 0.2567842605156038, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.04667339101433754, + "kl": 0.0, + "learning_rate": 3.78191856452726e-07, + "loss": -0.0002, + "num_tokens": 26114833.0, + "reward": 0.7166666984558105, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 757 + }, + { + "completion_length": 3478.0833740234375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5985.0, + "completions/mean_length": 4027.166748046875, + "completions/mean_terminated_length": 3794.27294921875, + "completions/min_length": 2198.0, + "completions/min_terminated_length": 2198.0, + "epoch": 0.257123473541384, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.39453667402267456, + "kl": NaN, + "learning_rate": 3.7801932367149757e-07, + "loss": -0.0182, + "num_tokens": 26167190.0, + "reward": 0.7416667938232422, + "reward_std": 0.1855173110961914, + "rewards/correctness_reward_func/mean": 0.46666669845581055, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660254627466202, + "step": 758 + }, + { + "completion_length": 1258.3333435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2983.0, + "completions/max_terminated_length": 2983.0, + "completions/mean_length": 1258.3333740234375, + "completions/mean_terminated_length": 1258.3333740234375, + "completions/min_length": 442.0, + "completions/min_terminated_length": 442.0, + "epoch": 0.2574626865671642, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.669351519240081e-07, + "kl": 0.0, + "learning_rate": 3.778467908902691e-07, + "loss": 0.0, + "num_tokens": 26195550.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 759 + }, + { + "completion_length": 708.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1381.0, + "completions/max_terminated_length": 1381.0, + "completions/mean_length": 708.9166870117188, + "completions/mean_terminated_length": 708.9166870117188, + "completions/min_length": 423.0, + "completions/min_terminated_length": 423.0, + "epoch": 0.25780189959294436, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.04684539511799812, + "kl": 0.0, + "learning_rate": 3.776742581090407e-07, + "loss": -0.0, + "num_tokens": 26213207.0, + "reward": 0.7875000834465027, + "reward_std": 0.03061862848699093, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 760 + }, + { + "completion_length": 869.0000305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2138.0, + "completions/max_terminated_length": 2138.0, + "completions/mean_length": 869.0, + "completions/mean_terminated_length": 869.0, + "completions/min_length": 444.0, + "completions/min_terminated_length": 444.0, + "epoch": 0.2581411126187246, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07464686036109924, + "kl": 0.0, + "learning_rate": 3.775017253278123e-07, + "loss": 0.0001, + "num_tokens": 26235623.0, + "reward": 1.2833333015441895, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.9833333492279053, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 761 + }, + { + "completion_length": 1791.666748046875, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5793.0, + "completions/mean_length": 2340.75, + "completions/mean_terminated_length": 1954.5455322265625, + "completions/min_length": 737.0, + "completions/min_terminated_length": 737.0, + "epoch": 0.25848032564450474, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.49990057945251465, + "kl": NaN, + "learning_rate": 3.7732919254658383e-07, + "loss": -0.0433, + "num_tokens": 26269273.0, + "reward": 1.0250000953674316, + "reward_std": 0.2906580865383148, + "rewards/correctness_reward_func/mean": 0.75, + "rewards/correctness_reward_func/std": 0.36306774616241455, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660254627466202, + "step": 762 + }, + { + "completion_length": 1257.666748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2279.0, + "completions/max_terminated_length": 2279.0, + "completions/mean_length": 1257.666748046875, + "completions/mean_terminated_length": 1257.666748046875, + "completions/min_length": 546.0, + "completions/min_terminated_length": 546.0, + "epoch": 0.25881953867028495, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.947257240128238e-08, + "kl": 0.0, + "learning_rate": 3.771566597653554e-07, + "loss": 0.0, + "num_tokens": 26291937.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 763 + }, + { + "completion_length": 2138.5, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4321.0, + "completions/mean_length": 3785.75, + "completions/mean_terminated_length": 2851.333251953125, + "completions/min_length": 1372.0, + "completions/min_terminated_length": 1372.0, + "epoch": 0.2591587516960651, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0325132608413696, + "kl": NaN, + "learning_rate": 3.7698412698412694e-07, + "loss": -0.0504, + "num_tokens": 26331945.0, + "reward": 0.7416666746139526, + "reward_std": 0.4976527690887451, + "rewards/correctness_reward_func/mean": 0.5166666507720947, + "rewards/correctness_reward_func/std": 0.4628632962703705, + "rewards/format_reward_func/mean": 0.22500000894069672, + "rewards/format_reward_func/std": 0.13568010926246643, + "step": 764 + }, + { + "completion_length": 847.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1307.0, + "completions/max_terminated_length": 1307.0, + "completions/mean_length": 847.9166870117188, + "completions/mean_terminated_length": 847.9166870117188, + "completions/min_length": 562.0, + "completions/min_terminated_length": 562.0, + "epoch": 0.25949796472184533, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0442206388461273e-07, + "kl": 0.0, + "learning_rate": 3.7681159420289855e-07, + "loss": 0.0, + "num_tokens": 26353748.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 765 + }, + { + "completion_length": 719.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1199.0, + "completions/max_terminated_length": 1199.0, + "completions/mean_length": 719.1666870117188, + "completions/mean_terminated_length": 719.1666870117188, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.2598371777476255, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2788970172405243, + "kl": 0.0, + "learning_rate": 3.766390614216701e-07, + "loss": -0.0009, + "num_tokens": 26376988.0, + "reward": 0.6708333492279053, + "reward_std": 0.22383961081504822, + "rewards/correctness_reward_func/mean": 0.38333332538604736, + "rewards/correctness_reward_func/std": 0.4783177673816681, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 766 + }, + { + "completion_length": 1171.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1726.0, + "completions/max_terminated_length": 1726.0, + "completions/mean_length": 1171.916748046875, + "completions/mean_terminated_length": 1171.916748046875, + "completions/min_length": 713.0, + "completions/min_terminated_length": 713.0, + "epoch": 0.2601763907734057, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.0947449286268238e-07, + "kl": 0.0, + "learning_rate": 3.7646652864044165e-07, + "loss": 0.0, + "num_tokens": 26401281.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 767 + }, + { + "completion_length": 2836.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5037.0, + "completions/max_terminated_length": 5037.0, + "completions/mean_length": 2836.166748046875, + "completions/mean_terminated_length": 2836.166748046875, + "completions/min_length": 1051.0, + "completions/min_terminated_length": 1051.0, + "epoch": 0.26051560379918587, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.7629399585921326e-07, + "loss": 0.0, + "num_tokens": 26448257.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 768 + }, + { + "completion_length": 1207.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2167.0, + "completions/max_terminated_length": 2167.0, + "completions/mean_length": 1207.416748046875, + "completions/mean_terminated_length": 1207.416748046875, + "completions/min_length": 774.0, + "completions/min_terminated_length": 774.0, + "epoch": 0.2608548168249661, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.06325611472129822, + "kl": 0.0, + "learning_rate": 3.761214630779848e-07, + "loss": 0.0005, + "num_tokens": 26474326.0, + "reward": 1.2166666984558105, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.9166666865348816, + "rewards/correctness_reward_func/std": 0.10298573225736618, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 769 + }, + { + "completion_length": 2223.1666870117188, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5250.0, + "completions/mean_length": 2772.25, + "completions/mean_terminated_length": 2425.272705078125, + "completions/min_length": 816.0, + "completions/min_terminated_length": 816.0, + "epoch": 0.26119402985074625, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.499055415391922, + "kl": NaN, + "learning_rate": 3.7594893029675636e-07, + "loss": -0.0143, + "num_tokens": 26511738.0, + "reward": 0.9083334803581238, + "reward_std": 0.2905454635620117, + "rewards/correctness_reward_func/mean": 0.6333333253860474, + "rewards/correctness_reward_func/std": 0.3892494738101959, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 770 + }, + { + "completion_length": 2154.8334350585938, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5527.0, + "completions/max_terminated_length": 5527.0, + "completions/mean_length": 2154.83349609375, + "completions/mean_terminated_length": 2154.83349609375, + "completions/min_length": 946.0, + "completions/min_terminated_length": 946.0, + "epoch": 0.26153324287652646, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16773566603660583, + "kl": 0.0, + "learning_rate": 3.757763975155279e-07, + "loss": 0.0018, + "num_tokens": 26548642.0, + "reward": 1.183333396911621, + "reward_std": 0.09246458858251572, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 771 + }, + { + "completion_length": 502.16668701171875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 850.0, + "completions/max_terminated_length": 850.0, + "completions/mean_length": 502.16668701171875, + "completions/mean_terminated_length": 502.16668701171875, + "completions/min_length": 353.0, + "completions/min_terminated_length": 353.0, + "epoch": 0.2618724559023066, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.756038647342995e-07, + "loss": 0.0, + "num_tokens": 26566974.0, + "reward": 1.2999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 1.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 772 + }, + { + "completion_length": 684.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1408.0, + "completions/max_terminated_length": 1408.0, + "completions/mean_length": 684.4166870117188, + "completions/mean_terminated_length": 684.4166870117188, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "epoch": 0.26221166892808684, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07578389346599579, + "kl": 0.0, + "learning_rate": 3.754313319530711e-07, + "loss": -0.0004, + "num_tokens": 26585555.0, + "reward": 1.25, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.949999988079071, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 773 + }, + { + "completion_length": 1621.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3594.0, + "completions/max_terminated_length": 3594.0, + "completions/mean_length": 1621.8333740234375, + "completions/mean_terminated_length": 1621.8333740234375, + "completions/min_length": 887.0, + "completions/min_terminated_length": 887.0, + "epoch": 0.26255088195386705, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.3733241505397018e-07, + "kl": 0.0, + "learning_rate": 3.7525879917184263e-07, + "loss": 0.0, + "num_tokens": 26615775.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 774 + }, + { + "completion_length": 1442.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4935.0, + "completions/max_terminated_length": 4935.0, + "completions/mean_length": 1442.166748046875, + "completions/mean_terminated_length": 1442.166748046875, + "completions/min_length": 592.0, + "completions/min_terminated_length": 592.0, + "epoch": 0.2628900949796472, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.1121632531285286, + "kl": 0.0, + "learning_rate": 3.750862663906142e-07, + "loss": -0.0074, + "num_tokens": 26647673.0, + "reward": 0.7166666984558105, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 775 + }, + { + "completion_length": 520.5000152587891, + "completions/clipped_ratio": 0.0, + "completions/max_length": 897.0, + "completions/max_terminated_length": 897.0, + "completions/mean_length": 520.5, + "completions/mean_terminated_length": 520.5, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "epoch": 0.26322930800542743, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2791745960712433, + "kl": 0.0, + "learning_rate": 3.749137336093858e-07, + "loss": -0.0005, + "num_tokens": 26665733.0, + "reward": 1.1041667461395264, + "reward_std": 0.2002602219581604, + "rewards/correctness_reward_func/mean": 0.8166666030883789, + "rewards/correctness_reward_func/std": 0.27579087018966675, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 776 + }, + { + "completion_length": 702.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1141.0, + "completions/max_terminated_length": 1141.0, + "completions/mean_length": 702.0, + "completions/mean_terminated_length": 702.0, + "completions/min_length": 494.0, + "completions/min_terminated_length": 494.0, + "epoch": 0.2635685210312076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.7474120082815734e-07, + "loss": 0.0, + "num_tokens": 26684873.0, + "reward": 1.2999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 1.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 777 + }, + { + "completion_length": 1498.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1975.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 1498.75, + "completions/mean_terminated_length": 1498.75, + "completions/min_length": 1073.0, + "completions/min_terminated_length": 1073.0, + "epoch": 0.2639077340569878, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.745686680469289e-07, + "loss": 0.0, + "num_tokens": 26715134.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 778 + }, + { + "completion_length": 1654.166748046875, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5809.0, + "completions/mean_length": 3301.416748046875, + "completions/mean_terminated_length": 2205.5556640625, + "completions/min_length": 488.0, + "completions/min_terminated_length": 488.0, + "epoch": 0.26424694708276797, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9202248454093933, + "kl": NaN, + "learning_rate": 3.7439613526570044e-07, + "loss": -0.0862, + "num_tokens": 26745628.0, + "reward": 0.8583333492279053, + "reward_std": 0.27095508575439453, + "rewards/correctness_reward_func/mean": 0.6333333253860474, + "rewards/correctness_reward_func/std": 0.4735424220561981, + "rewards/format_reward_func/mean": 0.22500000894069672, + "rewards/format_reward_func/std": 0.13568010926246643, + "step": 779 + }, + { + "completion_length": 1470.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3567.0, + "completions/max_terminated_length": 3567.0, + "completions/mean_length": 1470.75, + "completions/mean_terminated_length": 1470.75, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "epoch": 0.2645861601085482, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.10494286566972733, + "kl": 0.0, + "learning_rate": 3.7422360248447205e-07, + "loss": -0.0021, + "num_tokens": 26774809.0, + "reward": 1.2666666507720947, + "reward_std": 0.05163975805044174, + "rewards/correctness_reward_func/mean": 0.9666666984558105, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 780 + }, + { + "completion_length": 2154.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4803.0, + "completions/max_terminated_length": 4803.0, + "completions/mean_length": 2154.166748046875, + "completions/mean_terminated_length": 2154.166748046875, + "completions/min_length": 835.0, + "completions/min_terminated_length": 835.0, + "epoch": 0.26492537313432835, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.740510697032436e-07, + "loss": 0.0, + "num_tokens": 26810025.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 781 + }, + { + "completion_length": 1063.2500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1335.0, + "completions/max_terminated_length": 1335.0, + "completions/mean_length": 1063.25, + "completions/mean_terminated_length": 1063.25, + "completions/min_length": 722.0, + "completions/min_terminated_length": 722.0, + "epoch": 0.26526458616010856, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.11350041627883911, + "kl": 0.0, + "learning_rate": 3.7387853692201516e-07, + "loss": -0.0009, + "num_tokens": 26833140.0, + "reward": 1.25, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.949999988079071, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 782 + }, + { + "completion_length": 1292.5833740234375, + "completions/clipped_ratio": 0.33333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6559.0, + "completions/mean_length": 3488.916748046875, + "completions/mean_terminated_length": 1938.875, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "epoch": 0.2656037991858887, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5954703092575073, + "kl": NaN, + "learning_rate": 3.737060041407867e-07, + "loss": -0.1164, + "num_tokens": 26860345.0, + "reward": 0.7333334684371948, + "reward_std": 0.2840188145637512, + "rewards/correctness_reward_func/mean": 0.5333333611488342, + "rewards/correctness_reward_func/std": 0.3938928246498108, + "rewards/format_reward_func/mean": 0.20000000298023224, + "rewards/format_reward_func/std": 0.14770980179309845, + "step": 783 + }, + { + "completion_length": 2248.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4318.0, + "completions/max_terminated_length": 4318.0, + "completions/mean_length": 2248.916748046875, + "completions/mean_terminated_length": 2248.916748046875, + "completions/min_length": 808.0, + "completions/min_terminated_length": 808.0, + "epoch": 0.26594301221166894, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.35711029171943665, + "kl": 0.0, + "learning_rate": 3.735334713595583e-07, + "loss": -0.0026, + "num_tokens": 26896800.0, + "reward": 0.7666667699813843, + "reward_std": 0.1632993221282959, + "rewards/correctness_reward_func/mean": 0.46666669845581055, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 784 + }, + { + "completion_length": 1661.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2417.0, + "completions/max_terminated_length": 2417.0, + "completions/mean_length": 1661.5833740234375, + "completions/mean_terminated_length": 1661.5833740234375, + "completions/min_length": 810.0, + "completions/min_terminated_length": 810.0, + "epoch": 0.2662822252374491, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.15416787564754486, + "kl": 0.0, + "learning_rate": 3.7336093857832987e-07, + "loss": -0.0007, + "num_tokens": 26927623.0, + "reward": 1.133333444595337, + "reward_std": 0.05163976177573204, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 785 + }, + { + "completion_length": 779.7500305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1260.0, + "completions/max_terminated_length": 1260.0, + "completions/mean_length": 779.75, + "completions/mean_terminated_length": 779.75, + "completions/min_length": 518.0, + "completions/min_terminated_length": 518.0, + "epoch": 0.2666214382632293, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07597201317548752, + "kl": 0.0, + "learning_rate": 3.731884057971014e-07, + "loss": 0.0004, + "num_tokens": 26947360.0, + "reward": 1.133333444595337, + "reward_std": 0.051639772951602936, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 786 + }, + { + "completion_length": 2350.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4980.0, + "completions/max_terminated_length": 4980.0, + "completions/mean_length": 2350.83349609375, + "completions/mean_terminated_length": 2350.83349609375, + "completions/min_length": 1081.0, + "completions/min_terminated_length": 1081.0, + "epoch": 0.2669606512890095, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6928236484527588, + "kl": 0.0, + "learning_rate": 3.73015873015873e-07, + "loss": 0.0143, + "num_tokens": 26985722.0, + "reward": 1.070833444595337, + "reward_std": 0.2486901879310608, + "rewards/correctness_reward_func/mean": 0.7833333015441895, + "rewards/correctness_reward_func/std": 0.26227444410324097, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 787 + }, + { + "completion_length": 1690.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4178.0, + "completions/max_terminated_length": 4178.0, + "completions/mean_length": 1690.166748046875, + "completions/mean_terminated_length": 1690.166748046875, + "completions/min_length": 448.0, + "completions/min_terminated_length": 448.0, + "epoch": 0.2672998643147897, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.48641929030418396, + "kl": 0.0, + "learning_rate": 3.728433402346446e-07, + "loss": 0.0025, + "num_tokens": 27017704.0, + "reward": 0.4333333671092987, + "reward_std": 0.20655912160873413, + "rewards/correctness_reward_func/mean": 0.13333334028720856, + "rewards/correctness_reward_func/std": 0.3113996088504791, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 788 + }, + { + "completion_length": 721.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1439.0, + "completions/max_terminated_length": 1439.0, + "completions/mean_length": 721.75, + "completions/mean_terminated_length": 721.75, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.26763907734056985, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.7267080745341613e-07, + "loss": 0.0, + "num_tokens": 27044245.0, + "reward": 1.2999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 1.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 789 + }, + { + "completion_length": 3101.166748046875, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6048.0, + "completions/mean_length": 3650.25, + "completions/mean_terminated_length": 3383.091064453125, + "completions/min_length": 2335.0, + "completions/min_terminated_length": 2335.0, + "epoch": 0.26797829036635007, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.937369704246521, + "kl": NaN, + "learning_rate": 3.724982746721877e-07, + "loss": -0.0396, + "num_tokens": 27088401.0, + "reward": 0.7041667699813843, + "reward_std": 0.46168631315231323, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 790 + }, + { + "completion_length": 2644.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6416.0, + "completions/max_terminated_length": 6416.0, + "completions/mean_length": 2644.08349609375, + "completions/mean_terminated_length": 2644.08349609375, + "completions/min_length": 737.0, + "completions/min_terminated_length": 737.0, + "epoch": 0.2683175033921303, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6890131235122681, + "kl": 0.0, + "learning_rate": 3.723257418909593e-07, + "loss": -0.0096, + "num_tokens": 27136918.0, + "reward": 0.38333332538604736, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func/mean": 0.0833333358168602, + "rewards/correctness_reward_func/std": 0.28867512941360474, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 791 + }, + { + "completion_length": 1909.0834350585938, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3999.0, + "completions/max_terminated_length": 3999.0, + "completions/mean_length": 1909.0833740234375, + "completions/mean_terminated_length": 1909.0833740234375, + "completions/min_length": 454.0, + "completions/min_terminated_length": 454.0, + "epoch": 0.26865671641791045, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6697491407394409, + "kl": 0.0, + "learning_rate": 3.7215320910973084e-07, + "loss": 0.0026, + "num_tokens": 27169673.0, + "reward": 1.0333333015441895, + "reward_std": 0.25819891691207886, + "rewards/correctness_reward_func/mean": 0.7333333492279053, + "rewards/correctness_reward_func/std": 0.44585633277893066, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 792 + }, + { + "completion_length": 594.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 887.0, + "completions/max_terminated_length": 887.0, + "completions/mean_length": 594.4166870117188, + "completions/mean_terminated_length": 594.4166870117188, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.26899592944369066, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.06149730086326599, + "kl": 0.0, + "learning_rate": 3.719806763285024e-07, + "loss": 0.0001, + "num_tokens": 27189796.0, + "reward": 0.7166666984558105, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 793 + }, + { + "completion_length": 974.5000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3086.0, + "completions/max_terminated_length": 3086.0, + "completions/mean_length": 974.5, + "completions/mean_terminated_length": 974.5, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 0.2693351424694708, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.1048205667284492e-07, + "kl": 0.0, + "learning_rate": 3.7180814354727395e-07, + "loss": 0.0, + "num_tokens": 27212596.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 794 + }, + { + "completion_length": 2349.83349609375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4858.0, + "completions/mean_length": 2898.916748046875, + "completions/mean_terminated_length": 2563.45458984375, + "completions/min_length": 1054.0, + "completions/min_terminated_length": 1054.0, + "epoch": 0.26967435549525104, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6481123566627502, + "kl": NaN, + "learning_rate": 3.7163561076604555e-07, + "loss": -0.0263, + "num_tokens": 27249230.0, + "reward": 0.770833432674408, + "reward_std": 0.2123773992061615, + "rewards/correctness_reward_func/mean": 0.4833333194255829, + "rewards/correctness_reward_func/std": 0.43029236793518066, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 795 + }, + { + "completion_length": 1115.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2305.0, + "completions/max_terminated_length": 2305.0, + "completions/mean_length": 1115.75, + "completions/mean_terminated_length": 1115.75, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.2700135685210312, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08609779179096222, + "kl": 0.0, + "learning_rate": 3.7146307798481705e-07, + "loss": 0.0008, + "num_tokens": 27273443.0, + "reward": 1.2708332538604736, + "reward_std": 0.07144343107938766, + "rewards/correctness_reward_func/mean": 0.9833333492279053, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 796 + }, + { + "completion_length": 1959.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3710.0, + "completions/max_terminated_length": 3710.0, + "completions/mean_length": 1959.416748046875, + "completions/mean_terminated_length": 1959.416748046875, + "completions/min_length": 1178.0, + "completions/min_terminated_length": 1178.0, + "epoch": 0.2703527815468114, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7615066766738892, + "kl": 0.0, + "learning_rate": 3.7129054520358866e-07, + "loss": 0.0246, + "num_tokens": 27307120.0, + "reward": 0.7666666507720947, + "reward_std": 0.4647580087184906, + "rewards/correctness_reward_func/mean": 0.46666666865348816, + "rewards/correctness_reward_func/std": 0.4923659563064575, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 797 + }, + { + "completion_length": 858.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1442.0, + "completions/max_terminated_length": 1442.0, + "completions/mean_length": 858.9166870117188, + "completions/mean_terminated_length": 858.9166870117188, + "completions/min_length": 540.0, + "completions/min_terminated_length": 540.0, + "epoch": 0.2706919945725916, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.3698106110095978, + "kl": 0.0, + "learning_rate": 3.711180124223602e-07, + "loss": 0.0015, + "num_tokens": 27332229.0, + "reward": 1.0208334922790527, + "reward_std": 0.19391795992851257, + "rewards/correctness_reward_func/mean": 0.7333333492279053, + "rewards/correctness_reward_func/std": 0.23094011843204498, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 798 + }, + { + "completion_length": 2415.666748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4826.0, + "completions/max_terminated_length": 4826.0, + "completions/mean_length": 2415.666748046875, + "completions/mean_terminated_length": 2415.666748046875, + "completions/min_length": 793.0, + "completions/min_terminated_length": 793.0, + "epoch": 0.2710312075983718, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6135579347610474, + "kl": 0.0, + "learning_rate": 3.709454796411318e-07, + "loss": 0.0115, + "num_tokens": 27375395.0, + "reward": 1.1708333492279053, + "reward_std": 0.22383961081504822, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.28867512941360474, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 799 + }, + { + "completion_length": 1509.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4804.0, + "completions/max_terminated_length": 4804.0, + "completions/mean_length": 1509.916748046875, + "completions/mean_terminated_length": 1509.916748046875, + "completions/min_length": 477.0, + "completions/min_terminated_length": 477.0, + "epoch": 0.27137042062415195, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.8519099853619991e-07, + "kl": 0.0, + "learning_rate": 3.7077294685990337e-07, + "loss": 0.0, + "num_tokens": 27403858.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 800 + }, + { + "completion_length": 1219.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2203.0, + "completions/max_terminated_length": 2203.0, + "completions/mean_length": 1219.166748046875, + "completions/mean_terminated_length": 1219.166748046875, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "epoch": 0.27170963364993217, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.10737733542919159, + "kl": 0.0, + "learning_rate": 3.706004140786749e-07, + "loss": 0.0009, + "num_tokens": 27433344.0, + "reward": 1.1500000953674316, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.8500000834465027, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 801 + }, + { + "completion_length": 606.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1109.0, + "completions/max_terminated_length": 1109.0, + "completions/mean_length": 606.6666870117188, + "completions/mean_terminated_length": 606.6666870117188, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "epoch": 0.27204884667571233, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.7042788129744653e-07, + "loss": 0.0, + "num_tokens": 27451136.0, + "reward": 1.2999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 1.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 802 + }, + { + "completion_length": 1556.5000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3046.0, + "completions/max_terminated_length": 3046.0, + "completions/mean_length": 1556.5, + "completions/mean_terminated_length": 1556.5, + "completions/min_length": 687.0, + "completions/min_terminated_length": 687.0, + "epoch": 0.27238805970149255, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5762483477592468, + "kl": 0.0, + "learning_rate": 3.702553485162181e-07, + "loss": 0.0254, + "num_tokens": 27480152.0, + "reward": 1.1208332777023315, + "reward_std": 0.27857524156570435, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.3892494738101959, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 803 + }, + { + "completion_length": 1405.0000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2416.0, + "completions/max_terminated_length": 2416.0, + "completions/mean_length": 1405.0, + "completions/mean_terminated_length": 1405.0, + "completions/min_length": 628.0, + "completions/min_terminated_length": 628.0, + "epoch": 0.2727272727272727, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4991465210914612, + "kl": 0.0, + "learning_rate": 3.7008281573498964e-07, + "loss": -0.0007, + "num_tokens": 27506474.0, + "reward": 0.46666666865348816, + "reward_std": 0.2581988573074341, + "rewards/correctness_reward_func/mean": 0.1666666716337204, + "rewards/correctness_reward_func/std": 0.3892494738101959, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 804 + }, + { + "completion_length": 1961.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3442.0, + "completions/max_terminated_length": 3442.0, + "completions/mean_length": 1961.3333740234375, + "completions/mean_terminated_length": 1961.3333740234375, + "completions/min_length": 982.0, + "completions/min_terminated_length": 982.0, + "epoch": 0.2730664857530529, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.4712648521708616e-07, + "kl": 0.0, + "learning_rate": 3.699102829537612e-07, + "loss": 0.0, + "num_tokens": 27545088.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 805 + }, + { + "completion_length": 1769.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3968.0, + "completions/max_terminated_length": 3968.0, + "completions/mean_length": 1769.0, + "completions/mean_terminated_length": 1769.0, + "completions/min_length": 532.0, + "completions/min_terminated_length": 532.0, + "epoch": 0.2734056987788331, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11471240222454071, + "kl": 0.0, + "learning_rate": 3.697377501725328e-07, + "loss": -0.0003, + "num_tokens": 27581796.0, + "reward": 1.1666667461395264, + "reward_std": 0.09559707343578339, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 806 + }, + { + "completion_length": 1644.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3036.0, + "completions/max_terminated_length": 3036.0, + "completions/mean_length": 1644.75, + "completions/mean_terminated_length": 1644.75, + "completions/min_length": 803.0, + "completions/min_terminated_length": 803.0, + "epoch": 0.2737449118046133, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.695652173913043e-07, + "loss": 0.0, + "num_tokens": 27614289.0, + "reward": 1.2999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 1.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 807 + }, + { + "completion_length": 1305.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3489.0, + "completions/max_terminated_length": 3489.0, + "completions/mean_length": 1305.0833740234375, + "completions/mean_terminated_length": 1305.0833740234375, + "completions/min_length": 634.0, + "completions/min_terminated_length": 634.0, + "epoch": 0.27408412483039346, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5795224905014038, + "kl": 0.0, + "learning_rate": 3.693926846100759e-07, + "loss": 0.0063, + "num_tokens": 27644182.0, + "reward": 1.0499999523162842, + "reward_std": 0.27386125922203064, + "rewards/correctness_reward_func/mean": 0.75, + "rewards/correctness_reward_func/std": 0.45226702094078064, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 808 + }, + { + "completion_length": 1074.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1799.0, + "completions/max_terminated_length": 1799.0, + "completions/mean_length": 1074.166748046875, + "completions/mean_terminated_length": 1074.166748046875, + "completions/min_length": 687.0, + "completions/min_terminated_length": 687.0, + "epoch": 0.2744233378561737, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.6922015182884745e-07, + "loss": 0.0, + "num_tokens": 27668220.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 809 + }, + { + "completion_length": 974.9167175292969, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1649.0, + "completions/max_terminated_length": 1649.0, + "completions/mean_length": 974.9166870117188, + "completions/mean_terminated_length": 974.9166870117188, + "completions/min_length": 667.0, + "completions/min_terminated_length": 667.0, + "epoch": 0.2747625508819539, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.06648023426532745, + "kl": 0.0, + "learning_rate": 3.6904761904761906e-07, + "loss": -0.0, + "num_tokens": 27695639.0, + "reward": 1.25, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.949999988079071, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 810 + }, + { + "completion_length": 1228.666748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1766.0, + "completions/max_terminated_length": 1766.0, + "completions/mean_length": 1228.666748046875, + "completions/mean_terminated_length": 1228.666748046875, + "completions/min_length": 449.0, + "completions/min_terminated_length": 449.0, + "epoch": 0.27510176390773405, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3744871914386749, + "kl": 0.0, + "learning_rate": 3.6887508626639056e-07, + "loss": 0.0064, + "num_tokens": 27720211.0, + "reward": 1.1166667938232422, + "reward_std": 0.24096208810806274, + "rewards/correctness_reward_func/mean": 0.8166666030883789, + "rewards/correctness_reward_func/std": 0.27579087018966675, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 811 + }, + { + "completion_length": 1277.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1874.0, + "completions/max_terminated_length": 1874.0, + "completions/mean_length": 1277.75, + "completions/mean_terminated_length": 1277.75, + "completions/min_length": 791.0, + "completions/min_terminated_length": 791.0, + "epoch": 0.27544097693351427, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.08306435495615005, + "kl": 0.0, + "learning_rate": 3.6870255348516216e-07, + "loss": 0.0017, + "num_tokens": 27745120.0, + "reward": 1.2833333015441895, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.9833333492279053, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 812 + }, + { + "completion_length": 1650.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3301.0, + "completions/max_terminated_length": 3301.0, + "completions/mean_length": 1650.916748046875, + "completions/mean_terminated_length": 1650.916748046875, + "completions/min_length": 864.0, + "completions/min_terminated_length": 864.0, + "epoch": 0.27578018995929443, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23837661743164062, + "kl": 0.0, + "learning_rate": 3.685300207039337e-07, + "loss": 0.0016, + "num_tokens": 27777681.0, + "reward": 0.7833334803581238, + "reward_std": 0.20412415266036987, + "rewards/correctness_reward_func/mean": 0.4833333194255829, + "rewards/correctness_reward_func/std": 0.43029236793518066, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 813 + }, + { + "completion_length": 703.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1160.0, + "completions/max_terminated_length": 1160.0, + "completions/mean_length": 703.1666870117188, + "completions/mean_terminated_length": 703.1666870117188, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, + "epoch": 0.27611940298507465, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.1073731854557991, + "kl": 0.0, + "learning_rate": 3.683574879227053e-07, + "loss": -0.0005, + "num_tokens": 27798101.0, + "reward": 1.25, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.949999988079071, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 814 + }, + { + "completion_length": 1182.7500305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1865.0, + "completions/max_terminated_length": 1865.0, + "completions/mean_length": 1182.75, + "completions/mean_terminated_length": 1182.75, + "completions/min_length": 749.0, + "completions/min_terminated_length": 749.0, + "epoch": 0.2764586160108548, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0674026682972908, + "kl": 0.0, + "learning_rate": 3.681849551414769e-07, + "loss": 0.0014, + "num_tokens": 27818804.0, + "reward": 1.2166666984558105, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.9166666865348816, + "rewards/correctness_reward_func/std": 0.10298573225736618, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 815 + }, + { + "completion_length": 1002.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1656.0, + "completions/max_terminated_length": 1656.0, + "completions/mean_length": 1002.5833740234375, + "completions/mean_terminated_length": 1002.5833740234375, + "completions/min_length": 650.0, + "completions/min_terminated_length": 650.0, + "epoch": 0.276797829036635, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.705409573560246e-07, + "kl": 0.0, + "learning_rate": 3.6801242236024843e-07, + "loss": 0.0, + "num_tokens": 27847707.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 816 + }, + { + "completion_length": 2383.0000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6308.0, + "completions/max_terminated_length": 6308.0, + "completions/mean_length": 2383.0, + "completions/mean_terminated_length": 2383.0, + "completions/min_length": 501.0, + "completions/min_terminated_length": 501.0, + "epoch": 0.2771370420624152, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.6783988957902003e-07, + "loss": 0.0, + "num_tokens": 27886677.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 817 + }, + { + "completion_length": 2868.666748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6046.0, + "completions/max_terminated_length": 6046.0, + "completions/mean_length": 2868.666748046875, + "completions/mean_terminated_length": 2868.666748046875, + "completions/min_length": 1894.0, + "completions/min_terminated_length": 1894.0, + "epoch": 0.2774762550881954, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09324238449335098, + "kl": 0.0, + "learning_rate": 3.6766735679779153e-07, + "loss": -0.0024, + "num_tokens": 27936599.0, + "reward": 0.7166666984558105, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 818 + }, + { + "completion_length": 1085.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1940.0, + "completions/max_terminated_length": 1940.0, + "completions/mean_length": 1085.0833740234375, + "completions/mean_terminated_length": 1085.0833740234375, + "completions/min_length": 498.0, + "completions/min_terminated_length": 498.0, + "epoch": 0.27781546811397556, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.5154705579334404e-07, + "kl": 0.0, + "learning_rate": 3.6749482401656314e-07, + "loss": 0.0, + "num_tokens": 27962658.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 819 + }, + { + "completion_length": 2202.2501220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5019.0, + "completions/max_terminated_length": 5019.0, + "completions/mean_length": 2202.25, + "completions/mean_terminated_length": 2202.25, + "completions/min_length": 559.0, + "completions/min_terminated_length": 559.0, + "epoch": 0.2781546811397558, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5000991821289062, + "kl": 0.0, + "learning_rate": 3.673222912353347e-07, + "loss": 0.0171, + "num_tokens": 28000857.0, + "reward": 0.8208333849906921, + "reward_std": 0.23816029727458954, + "rewards/correctness_reward_func/mean": 0.5333333015441895, + "rewards/correctness_reward_func/std": 0.47736650705337524, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 820 + }, + { + "completion_length": 3302.916748046875, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6512.0, + "completions/mean_length": 3852.0, + "completions/mean_terminated_length": 3603.181884765625, + "completions/min_length": 915.0, + "completions/min_terminated_length": 915.0, + "epoch": 0.27849389416553594, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2379552721977234, + "kl": NaN, + "learning_rate": 3.671497584541063e-07, + "loss": -0.0582, + "num_tokens": 28052630.0, + "reward": 1.120833396911621, + "reward_std": 0.19391795992851257, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.28069180250167847, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 821 + }, + { + "completion_length": 2307.2501220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3861.0, + "completions/max_terminated_length": 3861.0, + "completions/mean_length": 2307.25, + "completions/mean_terminated_length": 2307.25, + "completions/min_length": 1338.0, + "completions/min_terminated_length": 1338.0, + "epoch": 0.27883310719131615, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6435486674308777, + "kl": 0.0, + "learning_rate": 3.669772256728778e-07, + "loss": 0.0243, + "num_tokens": 28093595.0, + "reward": 0.8833333849906921, + "reward_std": 0.29902368783950806, + "rewards/correctness_reward_func/mean": 0.5833333134651184, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 822 + }, + { + "completion_length": 2714.2501220703125, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5474.0, + "completions/mean_length": 3263.33349609375, + "completions/mean_terminated_length": 2961.0, + "completions/min_length": 859.0, + "completions/min_terminated_length": 859.0, + "epoch": 0.2791723202170963, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2697756886482239, + "kl": NaN, + "learning_rate": 3.668046928916494e-07, + "loss": -0.0554, + "num_tokens": 28140860.0, + "reward": 1.0416667461395264, + "reward_std": 0.2457980364561081, + "rewards/correctness_reward_func/mean": 0.7666667103767395, + "rewards/correctness_reward_func/std": 0.2534608840942383, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 823 + }, + { + "completion_length": 2003.3334350585938, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4538.0, + "completions/max_terminated_length": 4538.0, + "completions/mean_length": 2003.3333740234375, + "completions/mean_terminated_length": 2003.3333740234375, + "completions/min_length": 551.0, + "completions/min_terminated_length": 551.0, + "epoch": 0.27951153324287653, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.072422556579113, + "kl": 0.0, + "learning_rate": 3.6663216011042096e-07, + "loss": -0.0009, + "num_tokens": 28174818.0, + "reward": 0.7166666984558105, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 824 + }, + { + "completion_length": 1121.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1858.0, + "completions/max_terminated_length": 1858.0, + "completions/mean_length": 1121.5, + "completions/mean_terminated_length": 1121.5, + "completions/min_length": 850.0, + "completions/min_terminated_length": 850.0, + "epoch": 0.2798507462686567, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.6645962732919256e-07, + "loss": 0.0, + "num_tokens": 28197018.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 825 + }, + { + "completion_length": 1077.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3488.0, + "completions/max_terminated_length": 3488.0, + "completions/mean_length": 1077.416748046875, + "completions/mean_terminated_length": 1077.416748046875, + "completions/min_length": 404.0, + "completions/min_terminated_length": 404.0, + "epoch": 0.2801899592944369, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6790959239006042, + "kl": 0.0, + "learning_rate": 3.6628709454796406e-07, + "loss": 0.0398, + "num_tokens": 28224377.0, + "reward": 0.9666666984558105, + "reward_std": 0.3098386526107788, + "rewards/correctness_reward_func/mean": 0.6666666269302368, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 826 + }, + { + "completion_length": 690.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1055.0, + "completions/max_terminated_length": 1055.0, + "completions/mean_length": 690.6666870117188, + "completions/mean_terminated_length": 690.6666870117188, + "completions/min_length": 414.0, + "completions/min_terminated_length": 414.0, + "epoch": 0.2805291723202171, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.7838985399976082e-07, + "kl": 0.0, + "learning_rate": 3.6611456176673567e-07, + "loss": 0.0, + "num_tokens": 28246261.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 827 + }, + { + "completion_length": 2305.416748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6477.0, + "completions/max_terminated_length": 6477.0, + "completions/mean_length": 2305.416748046875, + "completions/mean_terminated_length": 2305.416748046875, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "epoch": 0.2808683853459973, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.897094190120697, + "kl": 0.0, + "learning_rate": 3.659420289855072e-07, + "loss": -0.0296, + "num_tokens": 28284372.0, + "reward": 0.8000000715255737, + "reward_std": 0.24494895339012146, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.4472135901451111, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 828 + }, + { + "completion_length": 3132.916748046875, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6016.0, + "completions/mean_length": 4780.1669921875, + "completions/mean_terminated_length": 4177.22216796875, + "completions/min_length": 2212.0, + "completions/min_terminated_length": 2212.0, + "epoch": 0.2812075983717775, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5745806097984314, + "kl": NaN, + "learning_rate": 3.657694962042788e-07, + "loss": -0.0817, + "num_tokens": 28334783.0, + "reward": 0.9000000953674316, + "reward_std": 0.3510836958885193, + "rewards/correctness_reward_func/mean": 0.6500000357627869, + "rewards/correctness_reward_func/std": 0.40113475918769836, + "rewards/format_reward_func/mean": 0.2500000298023224, + "rewards/format_reward_func/std": 0.11677484214305878, + "step": 829 + }, + { + "completion_length": 1060.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1373.0, + "completions/max_terminated_length": 1373.0, + "completions/mean_length": 1060.5833740234375, + "completions/mean_terminated_length": 1060.5833740234375, + "completions/min_length": 712.0, + "completions/min_terminated_length": 712.0, + "epoch": 0.28154681139755766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.6559696342305033e-07, + "loss": 0.0, + "num_tokens": 28357872.0, + "reward": 1.2999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 1.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 830 + }, + { + "completion_length": 1614.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3408.0, + "completions/max_terminated_length": 3408.0, + "completions/mean_length": 1614.5833740234375, + "completions/mean_terminated_length": 1614.5833740234375, + "completions/min_length": 524.0, + "completions/min_terminated_length": 524.0, + "epoch": 0.2818860244233379, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.05196267366409302, + "kl": 0.0, + "learning_rate": 3.6542443064182193e-07, + "loss": -0.0006, + "num_tokens": 28387807.0, + "reward": 1.183333396911621, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 831 + }, + { + "completion_length": 1116.5, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 3091.0, + "completions/mean_length": 2763.75, + "completions/mean_terminated_length": 1488.6666259765625, + "completions/min_length": 654.0, + "completions/min_terminated_length": 654.0, + "epoch": 0.28222523744911804, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0311174392700195, + "kl": NaN, + "learning_rate": 3.6525189786059354e-07, + "loss": -0.0566, + "num_tokens": 28413589.0, + "reward": 0.4749999940395355, + "reward_std": 0.35601967573165894, + "rewards/correctness_reward_func/mean": 0.25, + "rewards/correctness_reward_func/std": 0.45226702094078064, + "rewards/format_reward_func/mean": 0.22500000894069672, + "rewards/format_reward_func/std": 0.13568010926246643, + "step": 832 + }, + { + "completion_length": 905.0000305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1330.0, + "completions/max_terminated_length": 1330.0, + "completions/mean_length": 905.0, + "completions/mean_terminated_length": 905.0, + "completions/min_length": 630.0, + "completions/min_terminated_length": 630.0, + "epoch": 0.28256445047489825, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.9492409225895244e-07, + "kl": 0.0, + "learning_rate": 3.6507936507936504e-07, + "loss": 0.0, + "num_tokens": 28437493.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 833 + }, + { + "completion_length": 1761.2500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3188.0, + "completions/max_terminated_length": 3188.0, + "completions/mean_length": 1761.25, + "completions/mean_terminated_length": 1761.25, + "completions/min_length": 734.0, + "completions/min_terminated_length": 734.0, + "epoch": 0.2829036635006784, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07774775475263596, + "kl": 0.0, + "learning_rate": 3.6490683229813664e-07, + "loss": -0.0002, + "num_tokens": 28470862.0, + "reward": 1.133333444595337, + "reward_std": 0.05163976177573204, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 834 + }, + { + "completion_length": 758.8333435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1266.0, + "completions/max_terminated_length": 1266.0, + "completions/mean_length": 758.8333740234375, + "completions/mean_terminated_length": 758.8333740234375, + "completions/min_length": 454.0, + "completions/min_terminated_length": 454.0, + "epoch": 0.28324287652645863, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0557556606727303e-07, + "kl": 0.0, + "learning_rate": 3.647342995169082e-07, + "loss": 0.0, + "num_tokens": 28494248.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 835 + }, + { + "completion_length": 2812.0834350585938, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6122.0, + "completions/max_terminated_length": 6122.0, + "completions/mean_length": 2812.08349609375, + "completions/mean_terminated_length": 2812.08349609375, + "completions/min_length": 758.0, + "completions/min_terminated_length": 758.0, + "epoch": 0.2835820895522388, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5875731706619263, + "kl": 0.0, + "learning_rate": 3.645617667356798e-07, + "loss": 0.0046, + "num_tokens": 28534869.0, + "reward": 0.9541667699813843, + "reward_std": 0.19900795817375183, + "rewards/correctness_reward_func/mean": 0.6666666865348816, + "rewards/correctness_reward_func/std": 0.31139957904815674, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 836 + }, + { + "completion_length": 1912.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5484.0, + "completions/max_terminated_length": 5484.0, + "completions/mean_length": 1912.25, + "completions/mean_terminated_length": 1912.25, + "completions/min_length": 714.0, + "completions/min_terminated_length": 714.0, + "epoch": 0.283921302578019, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.538487879315653e-07, + "kl": 0.0, + "learning_rate": 3.643892339544513e-07, + "loss": 0.0, + "num_tokens": 28567734.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 837 + }, + { + "completion_length": 942.0000305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1422.0, + "completions/max_terminated_length": 1422.0, + "completions/mean_length": 942.0, + "completions/mean_terminated_length": 942.0, + "completions/min_length": 564.0, + "completions/min_terminated_length": 564.0, + "epoch": 0.28426051560379917, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.08199096471071243, + "kl": 0.0, + "learning_rate": 3.642167011732229e-07, + "loss": 0.0005, + "num_tokens": 28595202.0, + "reward": 0.7333333492279053, + "reward_std": 0.05163976177573204, + "rewards/correctness_reward_func/mean": 0.43333330750465393, + "rewards/correctness_reward_func/std": 0.45792683959007263, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 838 + }, + { + "completion_length": 2428.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4072.0, + "completions/max_terminated_length": 4072.0, + "completions/mean_length": 2428.916748046875, + "completions/mean_terminated_length": 2428.916748046875, + "completions/min_length": 1415.0, + "completions/min_terminated_length": 1415.0, + "epoch": 0.2845997286295794, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8435646891593933, + "kl": 0.0, + "learning_rate": 3.6404416839199446e-07, + "loss": 0.0061, + "num_tokens": 28631879.0, + "reward": 0.8666667342185974, + "reward_std": 0.2581988573074341, + "rewards/correctness_reward_func/mean": 0.5666666626930237, + "rewards/correctness_reward_func/std": 0.42497774958610535, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 839 + }, + { + "completion_length": 1648.666748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5064.0, + "completions/max_terminated_length": 5064.0, + "completions/mean_length": 1648.666748046875, + "completions/mean_terminated_length": 1648.666748046875, + "completions/min_length": 652.0, + "completions/min_terminated_length": 652.0, + "epoch": 0.28493894165535955, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09471841156482697, + "kl": 0.0, + "learning_rate": 3.63871635610766e-07, + "loss": -0.0025, + "num_tokens": 28661653.0, + "reward": 1.1666667461395264, + "reward_std": 0.051639750599861145, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 840 + }, + { + "completion_length": 1834.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2920.0, + "completions/max_terminated_length": 2920.0, + "completions/mean_length": 1834.8333740234375, + "completions/mean_terminated_length": 1834.8333740234375, + "completions/min_length": 470.0, + "completions/min_terminated_length": 470.0, + "epoch": 0.28527815468113976, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0629253163933754, + "kl": 0.0, + "learning_rate": 3.6369910282953757e-07, + "loss": 0.0021, + "num_tokens": 28697795.0, + "reward": 1.1166667938232422, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 841 + }, + { + "completion_length": 1036.1667175292969, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1895.0, + "completions/max_terminated_length": 1895.0, + "completions/mean_length": 1036.166748046875, + "completions/mean_terminated_length": 1036.166748046875, + "completions/min_length": 578.0, + "completions/min_terminated_length": 578.0, + "epoch": 0.2856173677069199, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09502989798784256, + "kl": 0.0, + "learning_rate": 3.6352657004830917e-07, + "loss": -0.0, + "num_tokens": 28721341.0, + "reward": 1.2333333492279053, + "reward_std": 0.051639772951602936, + "rewards/correctness_reward_func/mean": 0.9333333373069763, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 842 + }, + { + "completion_length": 898.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1205.0, + "completions/max_terminated_length": 1205.0, + "completions/mean_length": 898.9166870117188, + "completions/mean_terminated_length": 898.9166870117188, + "completions/min_length": 597.0, + "completions/min_terminated_length": 597.0, + "epoch": 0.28595658073270014, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.972598036256386e-08, + "kl": 0.0, + "learning_rate": 3.633540372670807e-07, + "loss": 0.0, + "num_tokens": 28743330.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 843 + }, + { + "completion_length": 1803.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5706.0, + "completions/max_terminated_length": 5706.0, + "completions/mean_length": 1803.75, + "completions/mean_terminated_length": 1803.75, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.2862957937584803, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4343622028827667, + "kl": 0.0, + "learning_rate": 3.631815044858523e-07, + "loss": 0.0283, + "num_tokens": 28779027.0, + "reward": 0.38333332538604736, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func/mean": 0.0833333358168602, + "rewards/correctness_reward_func/std": 0.28867512941360474, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 844 + }, + { + "completion_length": 1681.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3518.0, + "completions/max_terminated_length": 3518.0, + "completions/mean_length": 1681.75, + "completions/mean_terminated_length": 1681.75, + "completions/min_length": 950.0, + "completions/min_terminated_length": 950.0, + "epoch": 0.2866350067842605, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07720570266246796, + "kl": 0.0, + "learning_rate": 3.6300897170462383e-07, + "loss": -0.0016, + "num_tokens": 28814628.0, + "reward": 1.183333396911621, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 845 + }, + { + "completion_length": 1228.666748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2419.0, + "completions/max_terminated_length": 2419.0, + "completions/mean_length": 1228.666748046875, + "completions/mean_terminated_length": 1228.666748046875, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "epoch": 0.28697421981004073, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1242604358585595e-07, + "kl": 0.0, + "learning_rate": 3.6283643892339544e-07, + "loss": 0.0, + "num_tokens": 28844198.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 846 + }, + { + "completion_length": 2611.666748046875, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6019.0, + "completions/mean_length": 3709.83349609375, + "completions/mean_terminated_length": 3134.0, + "completions/min_length": 1123.0, + "completions/min_terminated_length": 1123.0, + "epoch": 0.2873134328358209, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6555944085121155, + "kl": NaN, + "learning_rate": 3.6266390614216704e-07, + "loss": -0.0615, + "num_tokens": 28886206.0, + "reward": 0.7500001192092896, + "reward_std": 0.280963659286499, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.4472135901451111, + "rewards/format_reward_func/mean": 0.25, + "rewards/format_reward_func/std": 0.11677484214305878, + "step": 847 + }, + { + "completion_length": 774.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2040.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 774.75, + "completions/mean_terminated_length": 774.75, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.2876526458616011, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.7893466974783223e-07, + "kl": 0.0, + "learning_rate": 3.6249137336093854e-07, + "loss": 0.0, + "num_tokens": 28903849.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 848 + }, + { + "completion_length": 2093.2501220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4633.0, + "completions/max_terminated_length": 4633.0, + "completions/mean_length": 2093.25, + "completions/mean_terminated_length": 2093.25, + "completions/min_length": 529.0, + "completions/min_terminated_length": 529.0, + "epoch": 0.28799185888738127, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.6231884057971015e-07, + "loss": 0.0, + "num_tokens": 28938424.0, + "reward": 1.2999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 1.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 849 + }, + { + "completion_length": 2106.7501220703125, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4592.0, + "completions/mean_length": 2655.83349609375, + "completions/mean_terminated_length": 2298.272705078125, + "completions/min_length": 1069.0, + "completions/min_terminated_length": 1069.0, + "epoch": 0.2883310719131615, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.1451241672039032, + "kl": NaN, + "learning_rate": 3.621463077984817e-07, + "loss": -0.0101, + "num_tokens": 28975171.0, + "reward": 0.6750000715255737, + "reward_std": 0.06123724579811096, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 850 + }, + { + "completion_length": 1222.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2122.0, + "completions/max_terminated_length": 2122.0, + "completions/mean_length": 1222.5833740234375, + "completions/mean_terminated_length": 1222.5833740234375, + "completions/min_length": 738.0, + "completions/min_terminated_length": 738.0, + "epoch": 0.28867028493894165, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.7627006343445828e-07, + "kl": 0.0, + "learning_rate": 3.6197377501725325e-07, + "loss": 0.0, + "num_tokens": 29000672.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 851 + }, + { + "completion_length": 1198.0000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3363.0, + "completions/max_terminated_length": 3363.0, + "completions/mean_length": 1198.0, + "completions/mean_terminated_length": 1198.0, + "completions/min_length": 603.0, + "completions/min_terminated_length": 603.0, + "epoch": 0.28900949796472186, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.040780119597911835, + "kl": 0.0, + "learning_rate": 3.618012422360248e-07, + "loss": -0.0002, + "num_tokens": 29028500.0, + "reward": 0.2875000238418579, + "reward_std": 0.03061862289905548, + "rewards/correctness_reward_func/mean": 0.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.2875000238418579, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 852 + }, + { + "completion_length": 3095.416748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5384.0, + "completions/max_terminated_length": 5384.0, + "completions/mean_length": 3095.416748046875, + "completions/mean_terminated_length": 3095.416748046875, + "completions/min_length": 977.0, + "completions/min_terminated_length": 977.0, + "epoch": 0.289348710990502, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6323304772377014, + "kl": 0.0, + "learning_rate": 3.616287094547964e-07, + "loss": -0.0108, + "num_tokens": 29076835.0, + "reward": 0.7166666984558105, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.5149286985397339, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 853 + }, + { + "completion_length": 938.5000305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1590.0, + "completions/max_terminated_length": 1590.0, + "completions/mean_length": 938.5, + "completions/mean_terminated_length": 938.5, + "completions/min_length": 531.0, + "completions/min_terminated_length": 531.0, + "epoch": 0.28968792401628224, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.98534375185045e-08, + "kl": 0.0, + "learning_rate": 3.6145617667356797e-07, + "loss": 0.0, + "num_tokens": 29098753.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 854 + }, + { + "completion_length": 1426.1667175292969, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3073.0, + "completions/max_terminated_length": 3073.0, + "completions/mean_length": 1426.166748046875, + "completions/mean_terminated_length": 1426.166748046875, + "completions/min_length": 518.0, + "completions/min_terminated_length": 518.0, + "epoch": 0.2900271370420624, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.5459246444370365e-07, + "kl": 0.0, + "learning_rate": 3.612836438923395e-07, + "loss": 0.0, + "num_tokens": 29129841.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 855 + }, + { + "completion_length": 833.8333435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1799.0, + "completions/max_terminated_length": 1799.0, + "completions/mean_length": 833.8333740234375, + "completions/mean_terminated_length": 833.8333740234375, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "epoch": 0.2903663500678426, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.6111111111111107e-07, + "loss": 0.0, + "num_tokens": 29150641.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 856 + }, + { + "completion_length": 1173.1667175292969, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3143.0, + "completions/max_terminated_length": 3143.0, + "completions/mean_length": 1173.166748046875, + "completions/mean_terminated_length": 1173.166748046875, + "completions/min_length": 670.0, + "completions/min_terminated_length": 670.0, + "epoch": 0.2907055630936228, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09201031923294067, + "kl": 0.0, + "learning_rate": 3.609385783298827e-07, + "loss": 0.0009, + "num_tokens": 29175879.0, + "reward": 1.25, + "reward_std": 0.05477222427725792, + "rewards/correctness_reward_func/mean": 0.949999988079071, + "rewards/correctness_reward_func/std": 0.09045339375734329, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 857 + }, + { + "completion_length": 1476.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2281.0, + "completions/max_terminated_length": 2281.0, + "completions/mean_length": 1476.416748046875, + "completions/mean_terminated_length": 1476.416748046875, + "completions/min_length": 635.0, + "completions/min_terminated_length": 635.0, + "epoch": 0.291044776119403, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1626907081563331e-07, + "kl": 0.0, + "learning_rate": 3.6076604554865423e-07, + "loss": 0.0, + "num_tokens": 29205764.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 858 + }, + { + "completion_length": 2051.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3636.0, + "completions/max_terminated_length": 3636.0, + "completions/mean_length": 2051.33349609375, + "completions/mean_terminated_length": 2051.33349609375, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "epoch": 0.29138398914518315, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4645165503025055, + "kl": 0.0, + "learning_rate": 3.605935127674258e-07, + "loss": -0.0135, + "num_tokens": 29241348.0, + "reward": 0.38333332538604736, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func/mean": 0.0833333358168602, + "rewards/correctness_reward_func/std": 0.28867512941360474, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 859 + }, + { + "completion_length": 2124.3334350585938, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5150.0, + "completions/mean_length": 2673.416748046875, + "completions/mean_terminated_length": 2317.45458984375, + "completions/min_length": 672.0, + "completions/min_terminated_length": 672.0, + "epoch": 0.29172320217096337, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.17607992887496948, + "kl": NaN, + "learning_rate": 3.6042097998619734e-07, + "loss": -0.0124, + "num_tokens": 29280682.0, + "reward": 0.7749999761581421, + "reward_std": 0.06123724579811096, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 860 + }, + { + "completion_length": 2468.416748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5737.0, + "completions/max_terminated_length": 5737.0, + "completions/mean_length": 2468.416748046875, + "completions/mean_terminated_length": 2468.416748046875, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "epoch": 0.29206241519674353, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.25238487124443054, + "kl": 0.0, + "learning_rate": 3.6024844720496894e-07, + "loss": 0.0017, + "num_tokens": 29323107.0, + "reward": 1.1166667938232422, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func/mean": 0.8166666030883789, + "rewards/correctness_reward_func/std": 0.27579087018966675, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 861 + }, + { + "completion_length": 1086.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2235.0, + "completions/max_terminated_length": 2235.0, + "completions/mean_length": 1086.0833740234375, + "completions/mean_terminated_length": 1086.0833740234375, + "completions/min_length": 657.0, + "completions/min_terminated_length": 657.0, + "epoch": 0.29240162822252375, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09536635130643845, + "kl": 0.0, + "learning_rate": 3.600759144237405e-07, + "loss": -0.0001, + "num_tokens": 29345398.0, + "reward": 1.1666667461395264, + "reward_std": 0.051639750599861145, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 862 + }, + { + "completion_length": 2525.0833740234375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4973.0, + "completions/mean_length": 3074.166748046875, + "completions/mean_terminated_length": 2754.636474609375, + "completions/min_length": 1133.0, + "completions/min_terminated_length": 1133.0, + "epoch": 0.29274084124830396, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4000323414802551, + "kl": NaN, + "learning_rate": 3.5990338164251205e-07, + "loss": -0.0039, + "num_tokens": 29389211.0, + "reward": 0.6416666507720947, + "reward_std": 0.2474271059036255, + "rewards/correctness_reward_func/mean": 0.36666664481163025, + "rewards/correctness_reward_func/std": 0.45792683959007263, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 863 + }, + { + "completion_length": 940.5833435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1529.0, + "completions/max_terminated_length": 1529.0, + "completions/mean_length": 940.5833740234375, + "completions/mean_terminated_length": 940.5833740234375, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "epoch": 0.2930800542740841, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.42567434906959534, + "kl": 0.0, + "learning_rate": 3.5973084886128365e-07, + "loss": 0.0036, + "num_tokens": 29410548.0, + "reward": 1.058333396911621, + "reward_std": 0.2239791601896286, + "rewards/correctness_reward_func/mean": 0.7833333015441895, + "rewards/correctness_reward_func/std": 0.26227444410324097, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 864 + }, + { + "completion_length": 1378.2500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2369.0, + "completions/max_terminated_length": 2369.0, + "completions/mean_length": 1378.25, + "completions/mean_terminated_length": 1378.25, + "completions/min_length": 692.0, + "completions/min_terminated_length": 692.0, + "epoch": 0.29341926729986434, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.08405417203903198, + "kl": 0.0, + "learning_rate": 3.595583160800552e-07, + "loss": -0.0007, + "num_tokens": 29439381.0, + "reward": 1.133333444595337, + "reward_std": 0.051639772951602936, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 865 + }, + { + "completion_length": 1817.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3688.0, + "completions/max_terminated_length": 3688.0, + "completions/mean_length": 1817.5, + "completions/mean_terminated_length": 1817.5, + "completions/min_length": 645.0, + "completions/min_terminated_length": 645.0, + "epoch": 0.2937584803256445, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.4541046172998904e-07, + "kl": 0.0, + "learning_rate": 3.5938578329882676e-07, + "loss": 0.0, + "num_tokens": 29472459.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 866 + }, + { + "completion_length": 1045.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3683.0, + "completions/max_terminated_length": 3683.0, + "completions/mean_length": 1045.75, + "completions/mean_terminated_length": 1045.75, + "completions/min_length": 443.0, + "completions/min_terminated_length": 443.0, + "epoch": 0.2940976933514247, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0929461270570755, + "kl": 0.0, + "learning_rate": 3.592132505175983e-07, + "loss": -0.0029, + "num_tokens": 29493606.0, + "reward": 0.7666666507720947, + "reward_std": 0.051639750599861145, + "rewards/correctness_reward_func/mean": 0.46666666865348816, + "rewards/correctness_reward_func/std": 0.4923659563064575, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 867 + }, + { + "completion_length": 1191.0833435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2802.0, + "completions/max_terminated_length": 2802.0, + "completions/mean_length": 1191.0833740234375, + "completions/mean_terminated_length": 1191.0833740234375, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "epoch": 0.2944369063772049, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.08606947213411331, + "kl": 0.0, + "learning_rate": 3.590407177363699e-07, + "loss": -0.0002, + "num_tokens": 29520937.0, + "reward": 1.2833333015441895, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.9833333492279053, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 868 + }, + { + "completion_length": 2072.5000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4117.0, + "completions/max_terminated_length": 4117.0, + "completions/mean_length": 2072.5, + "completions/mean_terminated_length": 2072.5, + "completions/min_length": 475.0, + "completions/min_terminated_length": 475.0, + "epoch": 0.2947761194029851, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15073060989379883, + "kl": 0.0, + "learning_rate": 3.5886818495514147e-07, + "loss": -0.0003, + "num_tokens": 29560627.0, + "reward": 1.2333333492279053, + "reward_std": 0.0955970510840416, + "rewards/correctness_reward_func/mean": 0.9333333373069763, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 869 + }, + { + "completion_length": 1233.9166870117188, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 2920.0, + "completions/mean_length": 2332.08349609375, + "completions/mean_terminated_length": 1480.7000732421875, + "completions/min_length": 969.0, + "completions/min_terminated_length": 969.0, + "epoch": 0.29511533242876525, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.3028711974620819, + "kl": NaN, + "learning_rate": 3.58695652173913e-07, + "loss": -0.0307, + "num_tokens": 29584182.0, + "reward": 1.0166666507720947, + "reward_std": 0.2840188145637512, + "rewards/correctness_reward_func/mean": 0.7666666507720947, + "rewards/correctness_reward_func/std": 0.3700941801071167, + "rewards/format_reward_func/mean": 0.25, + "rewards/format_reward_func/std": 0.11677484214305878, + "step": 870 + }, + { + "completion_length": 1574.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6154.0, + "completions/max_terminated_length": 6154.0, + "completions/mean_length": 1574.416748046875, + "completions/mean_terminated_length": 1574.416748046875, + "completions/min_length": 503.0, + "completions/min_terminated_length": 503.0, + "epoch": 0.29545454545454547, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.661939263343811, + "kl": 0.0, + "learning_rate": 3.585231193926846e-07, + "loss": 0.0737, + "num_tokens": 29614547.0, + "reward": 1.1166666746139526, + "reward_std": 0.29902368783950806, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.3857303261756897, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 871 + }, + { + "completion_length": 1510.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3076.0, + "completions/max_terminated_length": 3076.0, + "completions/mean_length": 1510.666748046875, + "completions/mean_terminated_length": 1510.666748046875, + "completions/min_length": 427.0, + "completions/min_terminated_length": 427.0, + "epoch": 0.29579375848032563, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5096197128295898, + "kl": 0.0, + "learning_rate": 3.583505866114562e-07, + "loss": 0.0023, + "num_tokens": 29647225.0, + "reward": 1.2166666984558105, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func/mean": 0.9166666865348816, + "rewards/correctness_reward_func/std": 0.28867512941360474, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 872 + }, + { + "completion_length": 3703.3333740234375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6485.0, + "completions/mean_length": 4252.4169921875, + "completions/mean_terminated_length": 4040.0, + "completions/min_length": 1202.0, + "completions/min_terminated_length": 1202.0, + "epoch": 0.29613297150610585, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2425873577594757, + "kl": NaN, + "learning_rate": 3.581780538302277e-07, + "loss": -0.0457, + "num_tokens": 29703071.0, + "reward": 0.625, + "reward_std": 0.23611438274383545, + "rewards/correctness_reward_func/mean": 0.3499999940395355, + "rewards/correctness_reward_func/std": 0.4358898997306824, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 873 + }, + { + "completion_length": 1448.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2933.0, + "completions/max_terminated_length": 2933.0, + "completions/mean_length": 1448.666748046875, + "completions/mean_terminated_length": 1448.666748046875, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "epoch": 0.296472184531886, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0715579092502594, + "kl": 0.0, + "learning_rate": 3.580055210489993e-07, + "loss": -0.0002, + "num_tokens": 29735509.0, + "reward": 1.1666667461395264, + "reward_std": 0.05163975805044174, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 874 + }, + { + "completion_length": 821.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1282.0, + "completions/max_terminated_length": 1282.0, + "completions/mean_length": 821.5, + "completions/mean_terminated_length": 821.5, + "completions/min_length": 485.0, + "completions/min_terminated_length": 485.0, + "epoch": 0.2968113975576662, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07250086963176727, + "kl": 0.0, + "learning_rate": 3.5783298826777084e-07, + "loss": -0.0009, + "num_tokens": 29759443.0, + "reward": 1.1666667461395264, + "reward_std": 0.05163975805044174, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 875 + }, + { + "completion_length": 1254.8333740234375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 3132.0, + "completions/mean_length": 1803.916748046875, + "completions/mean_terminated_length": 1368.9091796875, + "completions/min_length": 446.0, + "completions/min_terminated_length": 446.0, + "epoch": 0.2971506105834464, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.47821852564811707, + "kl": NaN, + "learning_rate": 3.5766045548654245e-07, + "loss": -0.0038, + "num_tokens": 29784725.0, + "reward": 0.6083333492279053, + "reward_std": 0.3006936311721802, + "rewards/correctness_reward_func/mean": 0.3333333432674408, + "rewards/correctness_reward_func/std": 0.4923659563064575, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 876 + }, + { + "completion_length": 1243.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1934.0, + "completions/max_terminated_length": 1934.0, + "completions/mean_length": 1243.75, + "completions/mean_terminated_length": 1243.75, + "completions/min_length": 563.0, + "completions/min_terminated_length": 563.0, + "epoch": 0.2974898236092266, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5256324410438538, + "kl": 0.0, + "learning_rate": 3.5748792270531395e-07, + "loss": -0.0066, + "num_tokens": 29811872.0, + "reward": 1.0916666984558105, + "reward_std": 0.26536136865615845, + "rewards/correctness_reward_func/mean": 0.8166666030883789, + "rewards/correctness_reward_func/std": 0.27579087018966675, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.05838742107152939, + "step": 877 + }, + { + "completion_length": 1533.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5486.0, + "completions/max_terminated_length": 5486.0, + "completions/mean_length": 1533.0, + "completions/mean_terminated_length": 1533.0, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "epoch": 0.29782903663500676, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.3232329785823822, + "kl": 0.0, + "learning_rate": 3.5731538992408555e-07, + "loss": -0.0113, + "num_tokens": 29842292.0, + "reward": 0.6333333253860474, + "reward_std": 0.1632993221282959, + "rewards/correctness_reward_func/mean": 0.3333333432674408, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 878 + }, + { + "completion_length": 1707.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3730.0, + "completions/max_terminated_length": 3730.0, + "completions/mean_length": 1707.3333740234375, + "completions/mean_terminated_length": 1707.3333740234375, + "completions/min_length": 744.0, + "completions/min_terminated_length": 744.0, + "epoch": 0.298168249660787, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.7474680191753578e-07, + "kl": 0.0, + "learning_rate": 3.5714285714285716e-07, + "loss": 0.0, + "num_tokens": 29872560.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 879 + }, + { + "completion_length": 1907.8334350585938, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5257.0, + "completions/max_terminated_length": 5257.0, + "completions/mean_length": 1907.8333740234375, + "completions/mean_terminated_length": 1907.8333740234375, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.29850746268656714, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.42220346283284e-07, + "kl": 0.0, + "learning_rate": 3.569703243616287e-07, + "loss": 0.0, + "num_tokens": 29907760.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 880 + }, + { + "completion_length": 1137.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1893.0, + "completions/max_terminated_length": 1893.0, + "completions/mean_length": 1137.8333740234375, + "completions/mean_terminated_length": 1137.8333740234375, + "completions/min_length": 456.0, + "completions/min_terminated_length": 456.0, + "epoch": 0.29884667571234735, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.066347137093544, + "kl": 0.0, + "learning_rate": 3.5679779158040026e-07, + "loss": 0.0011, + "num_tokens": 29931818.0, + "reward": 1.1166667938232422, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 881 + }, + { + "completion_length": 1808.0834350585938, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3643.0, + "completions/max_terminated_length": 3643.0, + "completions/mean_length": 1808.0833740234375, + "completions/mean_terminated_length": 1808.0833740234375, + "completions/min_length": 686.0, + "completions/min_terminated_length": 686.0, + "epoch": 0.29918588873812757, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.566252587991718e-07, + "loss": 0.0, + "num_tokens": 29967603.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 882 + }, + { + "completion_length": 1442.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1852.0, + "completions/max_terminated_length": 1852.0, + "completions/mean_length": 1442.0833740234375, + "completions/mean_terminated_length": 1442.0833740234375, + "completions/min_length": 820.0, + "completions/min_terminated_length": 820.0, + "epoch": 0.29952510176390773, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5057232975959778, + "kl": 0.0, + "learning_rate": 3.564527260179434e-07, + "loss": 0.0043, + "num_tokens": 30000094.0, + "reward": 1.1041667461395264, + "reward_std": 0.27090632915496826, + "rewards/correctness_reward_func/mean": 0.8166666030883789, + "rewards/correctness_reward_func/std": 0.27579087018966675, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 883 + }, + { + "completion_length": 1418.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2682.0, + "completions/max_terminated_length": 2682.0, + "completions/mean_length": 1418.3333740234375, + "completions/mean_terminated_length": 1418.3333740234375, + "completions/min_length": 675.0, + "completions/min_terminated_length": 675.0, + "epoch": 0.29986431478968795, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.0655014054682397e-07, + "kl": 0.0, + "learning_rate": 3.562801932367149e-07, + "loss": 0.0, + "num_tokens": 30025184.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 884 + }, + { + "completion_length": 926.2500305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1558.0, + "completions/max_terminated_length": 1558.0, + "completions/mean_length": 926.25, + "completions/mean_terminated_length": 926.25, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.3002035278154681, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.08455709367990494, + "kl": 0.0, + "learning_rate": 3.5610766045548653e-07, + "loss": 0.0011, + "num_tokens": 30046109.0, + "reward": 1.183333396911621, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 885 + }, + { + "completion_length": 983.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1365.0, + "completions/max_terminated_length": 1365.0, + "completions/mean_length": 983.0, + "completions/mean_terminated_length": 983.0, + "completions/min_length": 604.0, + "completions/min_terminated_length": 604.0, + "epoch": 0.3005427408412483, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.2232033813907037e-07, + "kl": 0.0, + "learning_rate": 3.559351276742581e-07, + "loss": 0.0, + "num_tokens": 30072803.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 886 + }, + { + "completion_length": 1092.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1479.0, + "completions/max_terminated_length": 1479.0, + "completions/mean_length": 1092.666748046875, + "completions/mean_terminated_length": 1092.666748046875, + "completions/min_length": 793.0, + "completions/min_terminated_length": 793.0, + "epoch": 0.3008819538670285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1025518849492073, + "kl": 0.0, + "learning_rate": 3.557625948930297e-07, + "loss": -0.0005, + "num_tokens": 30095977.0, + "reward": 1.1666667461395264, + "reward_std": 0.09559705853462219, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 887 + }, + { + "completion_length": 1043.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1950.0, + "completions/max_terminated_length": 1950.0, + "completions/mean_length": 1043.3333740234375, + "completions/mean_terminated_length": 1043.3333740234375, + "completions/min_length": 683.0, + "completions/min_terminated_length": 683.0, + "epoch": 0.3012211668928087, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.7373888283600536e-07, + "kl": 0.0, + "learning_rate": 3.555900621118012e-07, + "loss": 0.0, + "num_tokens": 30121943.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 888 + }, + { + "completion_length": 1952.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3501.0, + "completions/max_terminated_length": 3501.0, + "completions/mean_length": 1952.25, + "completions/mean_terminated_length": 1952.25, + "completions/min_length": 539.0, + "completions/min_terminated_length": 539.0, + "epoch": 0.30156037991858886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.554175293305728e-07, + "loss": 0.0, + "num_tokens": 30153266.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 889 + }, + { + "completion_length": 1630.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3602.0, + "completions/max_terminated_length": 3602.0, + "completions/mean_length": 1630.666748046875, + "completions/mean_terminated_length": 1630.666748046875, + "completions/min_length": 514.0, + "completions/min_terminated_length": 514.0, + "epoch": 0.3018995929443691, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.09205930680036545, + "kl": 0.0, + "learning_rate": 3.5524499654934434e-07, + "loss": -0.0002, + "num_tokens": 30189412.0, + "reward": 1.2833333015441895, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.9833333492279053, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 890 + }, + { + "completion_length": 1857.916748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3834.0, + "completions/max_terminated_length": 3834.0, + "completions/mean_length": 1857.916748046875, + "completions/mean_terminated_length": 1857.916748046875, + "completions/min_length": 929.0, + "completions/min_terminated_length": 929.0, + "epoch": 0.30223880597014924, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.2736225585285865e-07, + "kl": 0.0, + "learning_rate": 3.5507246376811595e-07, + "loss": 0.0, + "num_tokens": 30221493.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 891 + }, + { + "completion_length": 1551.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2618.0, + "completions/max_terminated_length": 2618.0, + "completions/mean_length": 1551.75, + "completions/mean_terminated_length": 1551.75, + "completions/min_length": 926.0, + "completions/min_terminated_length": 926.0, + "epoch": 0.30257801899592945, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07093626260757446, + "kl": 0.0, + "learning_rate": 3.5489993098688745e-07, + "loss": 0.001, + "num_tokens": 30250272.0, + "reward": 0.7833333015441895, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.4833333492279053, + "rewards/correctness_reward_func/std": 0.5078176856040955, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 892 + }, + { + "completion_length": 2763.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5376.0, + "completions/max_terminated_length": 5376.0, + "completions/mean_length": 2763.58349609375, + "completions/mean_terminated_length": 2763.58349609375, + "completions/min_length": 1006.0, + "completions/min_terminated_length": 1006.0, + "epoch": 0.3029172320217096, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8445454835891724, + "kl": 0.0, + "learning_rate": 3.5472739820565906e-07, + "loss": 0.0196, + "num_tokens": 30297445.0, + "reward": 1.0333333015441895, + "reward_std": 0.4581989049911499, + "rewards/correctness_reward_func/mean": 0.7333333492279053, + "rewards/correctness_reward_func/std": 0.44585633277893066, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 893 + }, + { + "completion_length": 744.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1564.0, + "completions/max_terminated_length": 1564.0, + "completions/mean_length": 744.4166870117188, + "completions/mean_terminated_length": 744.4166870117188, + "completions/min_length": 425.0, + "completions/min_terminated_length": 425.0, + "epoch": 0.30325644504748983, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.08562467247247696, + "kl": 0.0, + "learning_rate": 3.5455486542443066e-07, + "loss": 0.0003, + "num_tokens": 30324564.0, + "reward": 1.2666666507720947, + "reward_std": 0.05163975805044174, + "rewards/correctness_reward_func/mean": 0.9666666984558105, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 894 + }, + { + "completion_length": 1995.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4017.0, + "completions/max_terminated_length": 4017.0, + "completions/mean_length": 1995.916748046875, + "completions/mean_terminated_length": 1995.916748046875, + "completions/min_length": 585.0, + "completions/min_terminated_length": 585.0, + "epoch": 0.30359565807327, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.11056003719568253, + "kl": 0.0, + "learning_rate": 3.543823326432022e-07, + "loss": 0.0017, + "num_tokens": 30357587.0, + "reward": 0.75, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.45000001788139343, + "rewards/correctness_reward_func/std": 0.47577688097953796, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 895 + }, + { + "completion_length": 1346.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6012.0, + "completions/max_terminated_length": 6012.0, + "completions/mean_length": 1346.416748046875, + "completions/mean_terminated_length": 1346.416748046875, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "epoch": 0.3039348710990502, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.997124433517456, + "kl": 0.0, + "learning_rate": 3.5420979986197377e-07, + "loss": 0.0458, + "num_tokens": 30385786.0, + "reward": 1.2000000476837158, + "reward_std": 0.20000001788139343, + "rewards/correctness_reward_func/mean": 0.9000000357627869, + "rewards/correctness_reward_func/std": 0.28919950127601624, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 896 + }, + { + "completion_length": 1527.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5303.0, + "completions/max_terminated_length": 5303.0, + "completions/mean_length": 1527.75, + "completions/mean_terminated_length": 1527.75, + "completions/min_length": 634.0, + "completions/min_terminated_length": 634.0, + "epoch": 0.30427408412483037, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4941687285900116, + "kl": 0.0, + "learning_rate": 3.540372670807453e-07, + "loss": 0.0035, + "num_tokens": 30416203.0, + "reward": 0.7166666984558105, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.5149286985397339, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 897 + }, + { + "completion_length": 1980.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6500.0, + "completions/max_terminated_length": 6500.0, + "completions/mean_length": 1980.0, + "completions/mean_terminated_length": 1980.0, + "completions/min_length": 618.0, + "completions/min_terminated_length": 618.0, + "epoch": 0.3046132971506106, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8511844873428345, + "kl": 0.0, + "learning_rate": 3.538647342995169e-07, + "loss": 0.0584, + "num_tokens": 30449485.0, + "reward": 0.7166666984558105, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.5149286985397339, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 898 + }, + { + "completion_length": 921.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1993.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 921.75, + "completions/mean_terminated_length": 921.75, + "completions/min_length": 423.0, + "completions/min_terminated_length": 423.0, + "epoch": 0.3049525101763908, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.2145842137888394e-07, + "kl": 0.0, + "learning_rate": 3.536922015182884e-07, + "loss": 0.0, + "num_tokens": 30472606.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 899 + }, + { + "completion_length": 2026.166748046875, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5969.0, + "completions/mean_length": 2575.25, + "completions/mean_terminated_length": 2210.36376953125, + "completions/min_length": 546.0, + "completions/min_terminated_length": 546.0, + "epoch": 0.30529172320217096, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5869353413581848, + "kl": NaN, + "learning_rate": 3.5351966873706003e-07, + "loss": -0.0101, + "num_tokens": 30508272.0, + "reward": 0.9916666746139526, + "reward_std": 0.2727941870689392, + "rewards/correctness_reward_func/mean": 0.7166666984558105, + "rewards/correctness_reward_func/std": 0.4386618733406067, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660254627466202, + "step": 900 + }, + { + "completion_length": 2621.33349609375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4830.0, + "completions/max_terminated_length": 4830.0, + "completions/mean_length": 2621.33349609375, + "completions/mean_terminated_length": 2621.33349609375, + "completions/min_length": 616.0, + "completions/min_terminated_length": 616.0, + "epoch": 0.3056309362279512, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.387363906564133e-07, + "kl": 0.0, + "learning_rate": 3.533471359558316e-07, + "loss": 0.0, + "num_tokens": 30554278.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 901 + }, + { + "completion_length": 1101.6666870117188, + "completions/clipped_ratio": 0.33333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5642.0, + "completions/mean_length": 3298.0, + "completions/mean_terminated_length": 1652.5, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "epoch": 0.30597014925373134, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21892796456813812, + "kl": NaN, + "learning_rate": 3.531746031746032e-07, + "loss": -0.0191, + "num_tokens": 30574698.0, + "reward": 0.7083333730697632, + "reward_std": 0.10790684819221497, + "rewards/correctness_reward_func/mean": 0.4833333492279053, + "rewards/correctness_reward_func/std": 0.5078176856040955, + "rewards/format_reward_func/mean": 0.22500000894069672, + "rewards/format_reward_func/std": 0.11965861916542053, + "step": 902 + }, + { + "completion_length": 1977.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3584.0, + "completions/max_terminated_length": 3584.0, + "completions/mean_length": 1977.3333740234375, + "completions/mean_terminated_length": 1977.3333740234375, + "completions/min_length": 894.0, + "completions/min_terminated_length": 894.0, + "epoch": 0.30630936227951155, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15072372555732727, + "kl": 0.0, + "learning_rate": 3.530020703933747e-07, + "loss": -0.0001, + "num_tokens": 30611248.0, + "reward": 1.2000000476837158, + "reward_std": 0.10327951610088348, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 903 + }, + { + "completion_length": 1630.7501220703125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3331.0, + "completions/max_terminated_length": 3331.0, + "completions/mean_length": 1630.75, + "completions/mean_terminated_length": 1630.75, + "completions/min_length": 589.0, + "completions/min_terminated_length": 589.0, + "epoch": 0.3066485753052917, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.10795415192842484, + "kl": 0.0, + "learning_rate": 3.528295376121463e-07, + "loss": 0.0029, + "num_tokens": 30643285.0, + "reward": 1.183333396911621, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 904 + }, + { + "completion_length": 1505.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2986.0, + "completions/max_terminated_length": 2986.0, + "completions/mean_length": 1505.25, + "completions/mean_terminated_length": 1505.25, + "completions/min_length": 497.0, + "completions/min_terminated_length": 497.0, + "epoch": 0.30698778833107193, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.32887357473373413, + "kl": 0.0, + "learning_rate": 3.5265700483091785e-07, + "loss": 0.0141, + "num_tokens": 30672184.0, + "reward": 0.7666667699813843, + "reward_std": 0.1632993221282959, + "rewards/correctness_reward_func/mean": 0.46666669845581055, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 905 + }, + { + "completion_length": 1593.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3868.0, + "completions/max_terminated_length": 3868.0, + "completions/mean_length": 1593.0833740234375, + "completions/mean_terminated_length": 1593.0833740234375, + "completions/min_length": 582.0, + "completions/min_terminated_length": 582.0, + "epoch": 0.3073270013568521, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7148328423500061, + "kl": 0.0, + "learning_rate": 3.5248447204968945e-07, + "loss": -0.0257, + "num_tokens": 30698309.0, + "reward": 0.7833334803581238, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func/mean": 0.4833333194255829, + "rewards/correctness_reward_func/std": 0.43029236793518066, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 906 + }, + { + "completion_length": 1990.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4104.0, + "completions/max_terminated_length": 4104.0, + "completions/mean_length": 1990.8333740234375, + "completions/mean_terminated_length": 1990.8333740234375, + "completions/min_length": 985.0, + "completions/min_terminated_length": 985.0, + "epoch": 0.3076662143826323, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0934876799583435, + "kl": 0.0, + "learning_rate": 3.5231193926846095e-07, + "loss": -0.0055, + "num_tokens": 30735015.0, + "reward": 1.1166667938232422, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 907 + }, + { + "completion_length": 1272.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3085.0, + "completions/max_terminated_length": 3085.0, + "completions/mean_length": 1272.166748046875, + "completions/mean_terminated_length": 1272.166748046875, + "completions/min_length": 672.0, + "completions/min_terminated_length": 672.0, + "epoch": 0.30800542740841247, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0728154107928276, + "kl": 0.0, + "learning_rate": 3.5213940648723256e-07, + "loss": -0.0013, + "num_tokens": 30765851.0, + "reward": 1.2666666507720947, + "reward_std": 0.05163975805044174, + "rewards/correctness_reward_func/mean": 0.9666666984558105, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 908 + }, + { + "completion_length": 1949.5834350585938, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5665.0, + "completions/mean_length": 2498.666748046875, + "completions/mean_terminated_length": 2126.818359375, + "completions/min_length": 812.0, + "completions/min_terminated_length": 812.0, + "epoch": 0.3083446404341927, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2722416818141937, + "kl": NaN, + "learning_rate": 3.5196687370600417e-07, + "loss": -0.0216, + "num_tokens": 30799218.0, + "reward": 1.008333444595337, + "reward_std": 0.22453658282756805, + "rewards/correctness_reward_func/mean": 0.7333333492279053, + "rewards/correctness_reward_func/std": 0.23094011843204498, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 909 + }, + { + "completion_length": 2033.3333740234375, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6482.0, + "completions/mean_length": 3131.5, + "completions/mean_terminated_length": 2440.0, + "completions/min_length": 1208.0, + "completions/min_terminated_length": 1208.0, + "epoch": 0.30868385345997285, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.22964391112327576, + "kl": NaN, + "learning_rate": 3.5179434092477567e-07, + "loss": -0.0203, + "num_tokens": 30830872.0, + "reward": 0.25, + "reward_std": 0.0774596780538559, + "rewards/correctness_reward_func/mean": 0.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.25, + "rewards/format_reward_func/std": 0.11677484214305878, + "step": 910 + }, + { + "completion_length": 1930.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3964.0, + "completions/max_terminated_length": 3964.0, + "completions/mean_length": 1930.0833740234375, + "completions/mean_terminated_length": 1930.0833740234375, + "completions/min_length": 728.0, + "completions/min_terminated_length": 728.0, + "epoch": 0.30902306648575306, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0788058415055275, + "kl": 0.0, + "learning_rate": 3.5162180814354727e-07, + "loss": 0.0019, + "num_tokens": 30866993.0, + "reward": 1.2166666984558105, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.9166666865348816, + "rewards/correctness_reward_func/std": 0.10298573225736618, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 911 + }, + { + "completion_length": 1843.0000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5078.0, + "completions/max_terminated_length": 5078.0, + "completions/mean_length": 1843.0, + "completions/mean_terminated_length": 1843.0, + "completions/min_length": 720.0, + "completions/min_terminated_length": 720.0, + "epoch": 0.3093622795115332, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17806807160377502, + "kl": 0.0, + "learning_rate": 3.514492753623188e-07, + "loss": -0.0043, + "num_tokens": 30903971.0, + "reward": 1.2166666984558105, + "reward_std": 0.10641197860240936, + "rewards/correctness_reward_func/mean": 0.9166666865348816, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 912 + }, + { + "completion_length": 1276.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1647.0, + "completions/max_terminated_length": 1647.0, + "completions/mean_length": 1276.75, + "completions/mean_terminated_length": 1276.75, + "completions/min_length": 797.0, + "completions/min_terminated_length": 797.0, + "epoch": 0.30970149253731344, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0859728405421265e-07, + "kl": 0.0, + "learning_rate": 3.5127674258109043e-07, + "loss": 0.0, + "num_tokens": 30936494.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 913 + }, + { + "completion_length": 1329.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2402.0, + "completions/max_terminated_length": 2402.0, + "completions/mean_length": 1329.75, + "completions/mean_terminated_length": 1329.75, + "completions/min_length": 508.0, + "completions/min_terminated_length": 508.0, + "epoch": 0.3100407055630936, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.0788388610526454e-07, + "kl": 0.0, + "learning_rate": 3.5110420979986193e-07, + "loss": 0.0, + "num_tokens": 30962819.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 914 + }, + { + "completion_length": 941.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1229.0, + "completions/max_terminated_length": 1229.0, + "completions/mean_length": 941.0, + "completions/mean_terminated_length": 941.0, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "epoch": 0.3103799185888738, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.0740662983535e-08, + "kl": 0.0, + "learning_rate": 3.5093167701863354e-07, + "loss": 0.0, + "num_tokens": 30982991.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 915 + }, + { + "completion_length": 1847.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2928.0, + "completions/max_terminated_length": 2928.0, + "completions/mean_length": 1847.5, + "completions/mean_terminated_length": 1847.5, + "completions/min_length": 1272.0, + "completions/min_terminated_length": 1272.0, + "epoch": 0.310719131614654, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5373735427856445, + "kl": 0.0, + "learning_rate": 3.507591442374051e-07, + "loss": 0.0057, + "num_tokens": 31018577.0, + "reward": 0.44999998807907104, + "reward_std": 0.36742347478866577, + "rewards/correctness_reward_func/mean": 0.14999999105930328, + "rewards/correctness_reward_func/std": 0.35290998220443726, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 916 + }, + { + "completion_length": 1876.0000610351562, + "completions/clipped_ratio": 0.16666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5651.0, + "completions/mean_length": 2974.166748046875, + "completions/mean_terminated_length": 2251.199951171875, + "completions/min_length": 555.0, + "completions/min_terminated_length": 555.0, + "epoch": 0.3110583446404342, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8309308290481567, + "kl": NaN, + "learning_rate": 3.505866114561767e-07, + "loss": -0.0607, + "num_tokens": 31055465.0, + "reward": 0.5833333730697632, + "reward_std": 0.4858439564704895, + "rewards/correctness_reward_func/mean": 0.3333333432674408, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.2500000298023224, + "rewards/format_reward_func/std": 0.11677484214305878, + "step": 917 + }, + { + "completion_length": 1174.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2559.0, + "completions/max_terminated_length": 2559.0, + "completions/mean_length": 1174.916748046875, + "completions/mean_terminated_length": 1174.916748046875, + "completions/min_length": 688.0, + "completions/min_terminated_length": 688.0, + "epoch": 0.3113975576662144, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.10468626022338867, + "kl": 0.0, + "learning_rate": 3.504140786749482e-07, + "loss": -0.0008, + "num_tokens": 31078678.0, + "reward": 1.1666667461395264, + "reward_std": 0.05163975805044174, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 918 + }, + { + "completion_length": 1952.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5511.0, + "completions/max_terminated_length": 5511.0, + "completions/mean_length": 1952.416748046875, + "completions/mean_terminated_length": 1952.416748046875, + "completions/min_length": 861.0, + "completions/min_terminated_length": 861.0, + "epoch": 0.31173677069199457, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5677090287208557, + "kl": 0.0, + "learning_rate": 3.502415458937198e-07, + "loss": -0.0326, + "num_tokens": 31110243.0, + "reward": 0.5166666507720947, + "reward_std": 0.24013885855674744, + "rewards/correctness_reward_func/mean": 0.21666665375232697, + "rewards/correctness_reward_func/std": 0.39504507184028625, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 919 + }, + { + "completion_length": 813.5000305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1229.0, + "completions/max_terminated_length": 1229.0, + "completions/mean_length": 813.5, + "completions/mean_terminated_length": 813.5, + "completions/min_length": 620.0, + "completions/min_terminated_length": 620.0, + "epoch": 0.3120759837177748, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.06546156108379364, + "kl": 0.0, + "learning_rate": 3.5006901311249135e-07, + "loss": 0.001, + "num_tokens": 31134165.0, + "reward": 0.7833333015441895, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.4833333492279053, + "rewards/correctness_reward_func/std": 0.5078176856040955, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 920 + }, + { + "completion_length": 829.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2224.0, + "completions/max_terminated_length": 2224.0, + "completions/mean_length": 829.5, + "completions/mean_terminated_length": 829.5, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "epoch": 0.31241519674355495, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0700530856847763, + "kl": 0.0, + "learning_rate": 3.498964803312629e-07, + "loss": -0.003, + "num_tokens": 31155189.0, + "reward": 1.2166666984558105, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.9166666865348816, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 921 + }, + { + "completion_length": 1294.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2378.0, + "completions/max_terminated_length": 2378.0, + "completions/mean_length": 1294.166748046875, + "completions/mean_terminated_length": 1294.166748046875, + "completions/min_length": 536.0, + "completions/min_terminated_length": 536.0, + "epoch": 0.31275440976933516, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1272267997264862, + "kl": 0.0, + "learning_rate": 3.4972394755003446e-07, + "loss": -0.002, + "num_tokens": 31176545.0, + "reward": 1.2000000476837158, + "reward_std": 0.08164963126182556, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 922 + }, + { + "completion_length": 1220.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1935.0, + "completions/max_terminated_length": 1935.0, + "completions/mean_length": 1220.916748046875, + "completions/mean_terminated_length": 1220.916748046875, + "completions/min_length": 633.0, + "completions/min_terminated_length": 633.0, + "epoch": 0.3130936227951153, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.055474903434515, + "kl": 0.0, + "learning_rate": 3.4955141476880606e-07, + "loss": 0.0003, + "num_tokens": 31205242.0, + "reward": 1.1166667938232422, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 923 + }, + { + "completion_length": 1090.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2956.0, + "completions/max_terminated_length": 2956.0, + "completions/mean_length": 1090.0833740234375, + "completions/mean_terminated_length": 1090.0833740234375, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.31343283582089554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.4937888198757767e-07, + "loss": 0.0, + "num_tokens": 31232585.0, + "reward": 1.2999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 1.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 924 + }, + { + "completion_length": 1138.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4259.0, + "completions/max_terminated_length": 4259.0, + "completions/mean_length": 1138.3333740234375, + "completions/mean_terminated_length": 1138.3333740234375, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.3137720488466757, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.37474891543388367, + "kl": 0.0, + "learning_rate": 3.4920634920634917e-07, + "loss": -0.0011, + "num_tokens": 31258053.0, + "reward": 1.1166667938232422, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func/mean": 0.8166666030883789, + "rewards/correctness_reward_func/std": 0.27579087018966675, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 925 + }, + { + "completion_length": 977.2500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2604.0, + "completions/max_terminated_length": 2604.0, + "completions/mean_length": 977.25, + "completions/mean_terminated_length": 977.25, + "completions/min_length": 443.0, + "completions/min_terminated_length": 443.0, + "epoch": 0.3141112618724559, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.163004222045856e-08, + "kl": 0.0, + "learning_rate": 3.490338164251208e-07, + "loss": 0.0, + "num_tokens": 31285440.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 926 + }, + { + "completion_length": 1147.0000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1901.0, + "completions/max_terminated_length": 1901.0, + "completions/mean_length": 1147.0, + "completions/mean_terminated_length": 1147.0, + "completions/min_length": 658.0, + "completions/min_terminated_length": 658.0, + "epoch": 0.3144504748982361, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1602305249880374e-07, + "kl": 0.0, + "learning_rate": 3.4886128364389233e-07, + "loss": 0.0, + "num_tokens": 31308624.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 927 + }, + { + "completion_length": 745.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1572.0, + "completions/max_terminated_length": 1572.0, + "completions/mean_length": 745.0833740234375, + "completions/mean_terminated_length": 745.0833740234375, + "completions/min_length": 447.0, + "completions/min_terminated_length": 447.0, + "epoch": 0.3147896879240163, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0053457799585885e-07, + "kl": 0.0, + "learning_rate": 3.4868875086266393e-07, + "loss": 0.0, + "num_tokens": 31327321.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 928 + }, + { + "completion_length": 883.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1468.0, + "completions/max_terminated_length": 1468.0, + "completions/mean_length": 883.0, + "completions/mean_terminated_length": 883.0, + "completions/min_length": 430.0, + "completions/min_terminated_length": 430.0, + "epoch": 0.31512890094979645, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.4851621808143543e-07, + "loss": 0.0, + "num_tokens": 31351351.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 929 + }, + { + "completion_length": 1326.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3650.0, + "completions/max_terminated_length": 3650.0, + "completions/mean_length": 1326.5833740234375, + "completions/mean_terminated_length": 1326.5833740234375, + "completions/min_length": 441.0, + "completions/min_terminated_length": 441.0, + "epoch": 0.31546811397557667, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2548946440219879, + "kl": 0.0, + "learning_rate": 3.4834368530020704e-07, + "loss": 0.0018, + "num_tokens": 31381022.0, + "reward": 1.0500000715255737, + "reward_std": 0.17606817185878754, + "rewards/correctness_reward_func/mean": 0.75, + "rewards/correctness_reward_func/std": 0.24308621883392334, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 930 + }, + { + "completion_length": 4050.166748046875, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 6140.0, + "completions/mean_length": 4599.25, + "completions/mean_terminated_length": 4418.36376953125, + "completions/min_length": 1481.0, + "completions/min_terminated_length": 1481.0, + "epoch": 0.31580732700135683, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.3112356662750244, + "kl": NaN, + "learning_rate": 3.481711525189786e-07, + "loss": -0.0497, + "num_tokens": 31438420.0, + "reward": 0.6791666746139526, + "reward_std": 0.2609677314758301, + "rewards/correctness_reward_func/mean": 0.4166666567325592, + "rewards/correctness_reward_func/std": 0.5149286985397339, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.09323723614215851, + "step": 931 + }, + { + "completion_length": 1440.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2638.0, + "completions/max_terminated_length": 2638.0, + "completions/mean_length": 1440.416748046875, + "completions/mean_terminated_length": 1440.416748046875, + "completions/min_length": 912.0, + "completions/min_terminated_length": 912.0, + "epoch": 0.31614654002713705, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.07152718305587769, + "kl": 0.0, + "learning_rate": 3.4799861973775015e-07, + "loss": -0.0004, + "num_tokens": 31466607.0, + "reward": 1.183333396911621, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 932 + }, + { + "completion_length": 913.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1962.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 913.1666870117188, + "completions/mean_terminated_length": 913.1666870117188, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.3164857530529172, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03750376030802727, + "kl": 0.0, + "learning_rate": 3.478260869565217e-07, + "loss": 0.0, + "num_tokens": 31491683.0, + "reward": 0.7833333015441895, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.4833333492279053, + "rewards/correctness_reward_func/std": 0.5078176856040955, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 933 + }, + { + "completion_length": 615.5833587646484, + "completions/clipped_ratio": 0.5, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 1752.0, + "completions/mean_length": 3910.08349609375, + "completions/mean_terminated_length": 1231.166748046875, + "completions/min_length": 780.0, + "completions/min_terminated_length": 780.0, + "epoch": 0.3168249660786974, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.052773211151361465, + "kl": NaN, + "learning_rate": 3.476535541752933e-07, + "loss": -0.0011, + "num_tokens": 31510014.0, + "reward": 0.6333333253860474, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.4833333492279053, + "rewards/correctness_reward_func/std": 0.5078176856040955, + "rewards/format_reward_func/mean": 0.15000000596046448, + "rewards/format_reward_func/std": 0.15666989982128143, + "step": 934 + }, + { + "completion_length": 1668.75, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5946.0, + "completions/mean_length": 3316.0, + "completions/mean_terminated_length": 2225.0, + "completions/min_length": 425.0, + "completions/min_terminated_length": 425.0, + "epoch": 0.31716417910447764, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8342804312705994, + "kl": NaN, + "learning_rate": 3.4748102139406486e-07, + "loss": -0.1025, + "num_tokens": 31543809.0, + "reward": 0.845833420753479, + "reward_std": 0.5563273429870605, + "rewards/correctness_reward_func/mean": 0.6333333253860474, + "rewards/correctness_reward_func/std": 0.4735424220561981, + "rewards/format_reward_func/mean": 0.21250002086162567, + "rewards/format_reward_func/std": 0.13505050539970398, + "step": 935 + }, + { + "completion_length": 444.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/max_terminated_length": 594.0, + "completions/mean_length": 444.75, + "completions/mean_terminated_length": 444.75, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.3175033921302578, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.06688177585601807, + "kl": 0.0, + "learning_rate": 3.473084886128364e-07, + "loss": -0.0, + "num_tokens": 31563336.0, + "reward": 1.2833333015441895, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.9833333492279053, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 936 + }, + { + "completion_length": 1351.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2880.0, + "completions/max_terminated_length": 2880.0, + "completions/mean_length": 1351.416748046875, + "completions/mean_terminated_length": 1351.416748046875, + "completions/min_length": 847.0, + "completions/min_terminated_length": 847.0, + "epoch": 0.317842605156038, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.477865070104599, + "kl": 0.0, + "learning_rate": 3.4713595583160796e-07, + "loss": 0.0164, + "num_tokens": 31591571.0, + "reward": 1.2166666984558105, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func/mean": 0.9166666865348816, + "rewards/correctness_reward_func/std": 0.28867512941360474, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 937 + }, + { + "completion_length": 2661.166748046875, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5988.0, + "completions/mean_length": 4308.4169921875, + "completions/mean_terminated_length": 3548.22216796875, + "completions/min_length": 1966.0, + "completions/min_terminated_length": 1966.0, + "epoch": 0.3181818181818182, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4337899684906006, + "kl": NaN, + "learning_rate": 3.4696342305037957e-07, + "loss": -0.0686, + "num_tokens": 31636261.0, + "reward": 0.7083333730697632, + "reward_std": 0.2518266439437866, + "rewards/correctness_reward_func/mean": 0.4833333194255829, + "rewards/correctness_reward_func/std": 0.43029236793518066, + "rewards/format_reward_func/mean": 0.22500000894069672, + "rewards/format_reward_func/std": 0.13568010926246643, + "step": 938 + }, + { + "completion_length": 688.7500305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1200.0, + "completions/max_terminated_length": 1200.0, + "completions/mean_length": 688.75, + "completions/mean_terminated_length": 688.75, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "epoch": 0.3185210312075984, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09581376612186432, + "kl": 0.0, + "learning_rate": 3.467908902691512e-07, + "loss": -0.0006, + "num_tokens": 31659796.0, + "reward": 1.2166666984558105, + "reward_std": 0.09246456623077393, + "rewards/correctness_reward_func/mean": 0.9166666865348816, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 939 + }, + { + "completion_length": 1968.166748046875, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5840.0, + "completions/mean_length": 3615.416748046875, + "completions/mean_terminated_length": 2624.22216796875, + "completions/min_length": 1067.0, + "completions/min_terminated_length": 1067.0, + "epoch": 0.31886024423337855, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.884489893913269, + "kl": NaN, + "learning_rate": 3.466183574879227e-07, + "loss": -0.1014, + "num_tokens": 31698522.0, + "reward": 0.5916666984558105, + "reward_std": 0.5406736135482788, + "rewards/correctness_reward_func/mean": 0.36666667461395264, + "rewards/correctness_reward_func/std": 0.45792683959007263, + "rewards/format_reward_func/mean": 0.2250000238418579, + "rewards/format_reward_func/std": 0.13568010926246643, + "step": 940 + }, + { + "completion_length": 1261.1666870117188, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 2849.0, + "completions/mean_length": 1810.25, + "completions/mean_terminated_length": 1375.8182373046875, + "completions/min_length": 785.0, + "completions/min_terminated_length": 785.0, + "epoch": 0.31919945725915877, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.24075216054916382, + "kl": NaN, + "learning_rate": 3.464458247066943e-07, + "loss": -0.0164, + "num_tokens": 31727606.0, + "reward": 1.1083333492279053, + "reward_std": 0.22453658282756805, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.2806917726993561, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 941 + }, + { + "completion_length": 985.2500305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1892.0, + "completions/max_terminated_length": 1892.0, + "completions/mean_length": 985.25, + "completions/mean_terminated_length": 985.25, + "completions/min_length": 441.0, + "completions/min_terminated_length": 441.0, + "epoch": 0.31953867028493893, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.4627329192546583e-07, + "loss": 0.0, + "num_tokens": 31750769.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 942 + }, + { + "completion_length": 887.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1810.0, + "completions/max_terminated_length": 1810.0, + "completions/mean_length": 887.9166870117188, + "completions/mean_terminated_length": 887.9166870117188, + "completions/min_length": 538.0, + "completions/min_terminated_length": 538.0, + "epoch": 0.31987788331071915, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.1579033671059733e-07, + "kl": 0.0, + "learning_rate": 3.461007591442374e-07, + "loss": 0.0, + "num_tokens": 31775662.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 943 + }, + { + "completion_length": 1304.5000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3547.0, + "completions/max_terminated_length": 3547.0, + "completions/mean_length": 1304.5, + "completions/mean_terminated_length": 1304.5, + "completions/min_length": 528.0, + "completions/min_terminated_length": 528.0, + "epoch": 0.3202170963364993, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.3239888946591236e-07, + "kl": 0.0, + "learning_rate": 3.4592822636300894e-07, + "loss": 0.0, + "num_tokens": 31795600.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 944 + }, + { + "completion_length": 697.7500305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1262.0, + "completions/max_terminated_length": 1262.0, + "completions/mean_length": 697.75, + "completions/mean_terminated_length": 697.75, + "completions/min_length": 494.0, + "completions/min_terminated_length": 494.0, + "epoch": 0.3205563093622795, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2733204960823059, + "kl": 0.0, + "learning_rate": 3.4575569358178054e-07, + "loss": -0.0005, + "num_tokens": 31818679.0, + "reward": 1.2000000476837158, + "reward_std": 0.19999998807907104, + "rewards/correctness_reward_func/mean": 0.9000000357627869, + "rewards/correctness_reward_func/std": 0.28919950127601624, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 945 + }, + { + "completion_length": 2257.416748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6290.0, + "completions/max_terminated_length": 6290.0, + "completions/mean_length": 2257.416748046875, + "completions/mean_terminated_length": 2257.416748046875, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 0.3208955223880597, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6494872570037842, + "kl": 0.0, + "learning_rate": 3.455831608005521e-07, + "loss": -0.0207, + "num_tokens": 31862034.0, + "reward": 0.7833333015441895, + "reward_std": 0.36742347478866577, + "rewards/correctness_reward_func/mean": 0.4833333492279053, + "rewards/correctness_reward_func/std": 0.5078176856040955, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 946 + }, + { + "completion_length": 1796.0000915527344, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4971.0, + "completions/mean_length": 2345.08349609375, + "completions/mean_terminated_length": 1959.2728271484375, + "completions/min_length": 595.0, + "completions/min_terminated_length": 595.0, + "epoch": 0.3212347354138399, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12197453528642654, + "kl": NaN, + "learning_rate": 3.4541062801932365e-07, + "loss": -0.0085, + "num_tokens": 31895976.0, + "reward": 0.7458333373069763, + "reward_std": 0.10710843652486801, + "rewards/correctness_reward_func/mean": 0.4833333492279053, + "rewards/correctness_reward_func/std": 0.5078176856040955, + "rewards/format_reward_func/mean": 0.26250001788139343, + "rewards/format_reward_func/std": 0.09323723614215851, + "step": 947 + }, + { + "completion_length": 608.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1151.0, + "completions/max_terminated_length": 1151.0, + "completions/mean_length": 608.1666870117188, + "completions/mean_terminated_length": 608.1666870117188, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "epoch": 0.32157394843962006, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0008302098185595e-07, + "kl": 0.0, + "learning_rate": 3.452380952380952e-07, + "loss": 0.0, + "num_tokens": 31916744.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 948 + }, + { + "completion_length": 878.2500610351562, + "completions/clipped_ratio": 0.41666666666666663, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4383.0, + "completions/mean_length": 3623.666748046875, + "completions/mean_terminated_length": 1505.571533203125, + "completions/min_length": 724.0, + "completions/min_terminated_length": 724.0, + "epoch": 0.3219131614654003, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.11442892253398895, + "kl": NaN, + "learning_rate": 3.450655624568668e-07, + "loss": -0.0083, + "num_tokens": 31940219.0, + "reward": 0.7250000238418579, + "reward_std": 0.06708204001188278, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.2250000238418579, + "rewards/format_reward_func/std": 0.11965861171483994, + "step": 949 + }, + { + "completion_length": 1511.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3411.0, + "completions/max_terminated_length": 3411.0, + "completions/mean_length": 1511.666748046875, + "completions/mean_terminated_length": 1511.666748046875, + "completions/min_length": 705.0, + "completions/min_terminated_length": 705.0, + "epoch": 0.32225237449118044, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.4038799633908638e-07, + "kl": 0.0, + "learning_rate": 3.4489302967563836e-07, + "loss": 0.0, + "num_tokens": 31968571.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 950 + }, + { + "completion_length": 1021.1667175292969, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2335.0, + "completions/max_terminated_length": 2335.0, + "completions/mean_length": 1021.1666870117188, + "completions/mean_terminated_length": 1021.1666870117188, + "completions/min_length": 677.0, + "completions/min_terminated_length": 677.0, + "epoch": 0.32259158751696065, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.447204968944099e-07, + "loss": 0.0, + "num_tokens": 31991547.0, + "reward": 0.7999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 951 + }, + { + "completion_length": 739.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1009.0, + "completions/max_terminated_length": 1009.0, + "completions/mean_length": 739.6666870117188, + "completions/mean_terminated_length": 739.6666870117188, + "completions/min_length": 436.0, + "completions/min_terminated_length": 436.0, + "epoch": 0.3229308005427408, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.2268769467027596e-07, + "kl": 0.0, + "learning_rate": 3.4454796411318147e-07, + "loss": 0.0, + "num_tokens": 32012159.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 952 + }, + { + "completion_length": 1022.5000610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2877.0, + "completions/max_terminated_length": 2877.0, + "completions/mean_length": 1022.5, + "completions/mean_terminated_length": 1022.5, + "completions/min_length": 495.0, + "completions/min_terminated_length": 495.0, + "epoch": 0.32327001356852103, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.05467428267002106, + "kl": 0.0, + "learning_rate": 3.4437543133195307e-07, + "loss": -0.0002, + "num_tokens": 32035457.0, + "reward": 1.183333396911621, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 953 + }, + { + "completion_length": 2420.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5646.0, + "completions/max_terminated_length": 5646.0, + "completions/mean_length": 2420.416748046875, + "completions/mean_terminated_length": 2420.416748046875, + "completions/min_length": 400.0, + "completions/min_terminated_length": 400.0, + "epoch": 0.32360922659430125, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4149426221847534, + "kl": 0.0, + "learning_rate": 3.4420289855072457e-07, + "loss": -0.0127, + "num_tokens": 32074594.0, + "reward": 0.7541667819023132, + "reward_std": 0.13268069922924042, + "rewards/correctness_reward_func/mean": 0.46666666865348816, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 954 + }, + { + "completion_length": 832.0833435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1620.0, + "completions/max_terminated_length": 1620.0, + "completions/mean_length": 832.0833740234375, + "completions/mean_terminated_length": 832.0833740234375, + "completions/min_length": 480.0, + "completions/min_terminated_length": 480.0, + "epoch": 0.3239484396200814, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.08572785556316376, + "kl": 0.0, + "learning_rate": 3.440303657694962e-07, + "loss": 0.0007, + "num_tokens": 32092451.0, + "reward": 1.2666666507720947, + "reward_std": 0.051639750599861145, + "rewards/correctness_reward_func/mean": 0.9666666984558105, + "rewards/correctness_reward_func/std": 0.07784988731145859, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 955 + }, + { + "completion_length": 1114.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1610.0, + "completions/max_terminated_length": 1610.0, + "completions/mean_length": 1114.416748046875, + "completions/mean_terminated_length": 1114.416748046875, + "completions/min_length": 749.0, + "completions/min_terminated_length": 749.0, + "epoch": 0.3242876526458616, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.0851454962667049e-07, + "kl": 0.0, + "learning_rate": 3.438578329882678e-07, + "loss": 0.0, + "num_tokens": 32116522.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 956 + }, + { + "completion_length": 2551.08349609375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3836.0, + "completions/max_terminated_length": 3836.0, + "completions/mean_length": 2551.08349609375, + "completions/mean_terminated_length": 2551.08349609375, + "completions/min_length": 1350.0, + "completions/min_terminated_length": 1350.0, + "epoch": 0.3246268656716418, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6476059556007385, + "kl": 0.0, + "learning_rate": 3.4368530020703934e-07, + "loss": -0.0199, + "num_tokens": 32160011.0, + "reward": 0.7666666507720947, + "reward_std": 0.36329931020736694, + "rewards/correctness_reward_func/mean": 0.46666666865348816, + "rewards/correctness_reward_func/std": 0.4923659563064575, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 957 + }, + { + "completion_length": 2170.5834350585938, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4740.0, + "completions/max_terminated_length": 4740.0, + "completions/mean_length": 2170.58349609375, + "completions/mean_terminated_length": 2170.58349609375, + "completions/min_length": 642.0, + "completions/min_terminated_length": 642.0, + "epoch": 0.324966078697422, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.45245659351348877, + "kl": 0.0, + "learning_rate": 3.435127674258109e-07, + "loss": -0.0042, + "num_tokens": 32202528.0, + "reward": 0.7666667699813843, + "reward_std": 0.1632993221282959, + "rewards/correctness_reward_func/mean": 0.46666666865348816, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 958 + }, + { + "completion_length": 1702.166748046875, + "completions/clipped_ratio": 0.33333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 4745.0, + "completions/mean_length": 3898.5, + "completions/mean_terminated_length": 2553.25, + "completions/min_length": 811.0, + "completions/min_terminated_length": 811.0, + "epoch": 0.32530529172320216, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7316205501556396, + "kl": NaN, + "learning_rate": 3.4334023464458244e-07, + "loss": -0.0585, + "num_tokens": 32236376.0, + "reward": 0.8000000715255737, + "reward_std": 0.33565855026245117, + "rewards/correctness_reward_func/mean": 0.5999999642372131, + "rewards/correctness_reward_func/std": 0.45126086473464966, + "rewards/format_reward_func/mean": 0.20000000298023224, + "rewards/format_reward_func/std": 0.14770980179309845, + "step": 959 + }, + { + "completion_length": 1313.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3487.0, + "completions/max_terminated_length": 3487.0, + "completions/mean_length": 1313.0833740234375, + "completions/mean_terminated_length": 1313.0833740234375, + "completions/min_length": 599.0, + "completions/min_terminated_length": 599.0, + "epoch": 0.3256445047489824, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10751835256814957, + "kl": 0.0, + "learning_rate": 3.4316770186335405e-07, + "loss": 0.001, + "num_tokens": 32263551.0, + "reward": 1.183333396911621, + "reward_std": 0.09246458858251572, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.10298573225736618, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 960 + }, + { + "completion_length": 672.5000305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 919.0, + "completions/max_terminated_length": 919.0, + "completions/mean_length": 672.5, + "completions/mean_terminated_length": 672.5, + "completions/min_length": 425.0, + "completions/min_terminated_length": 425.0, + "epoch": 0.32598371777476254, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0762104096784242e-07, + "kl": 0.0, + "learning_rate": 3.429951690821256e-07, + "loss": 0.0, + "num_tokens": 32283729.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 961 + }, + { + "completion_length": 1622.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2902.0, + "completions/max_terminated_length": 2902.0, + "completions/mean_length": 1622.75, + "completions/mean_terminated_length": 1622.75, + "completions/min_length": 470.0, + "completions/min_terminated_length": 470.0, + "epoch": 0.32632293080054275, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.08299823850393295, + "kl": 0.0, + "learning_rate": 3.4282263630089715e-07, + "loss": -0.0008, + "num_tokens": 32315124.0, + "reward": 1.1500000953674316, + "reward_std": 0.05477223917841911, + "rewards/correctness_reward_func/mean": 0.8499999642372131, + "rewards/correctness_reward_func/std": 0.09045340120792389, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 962 + }, + { + "completion_length": 2098.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3588.0, + "completions/max_terminated_length": 3588.0, + "completions/mean_length": 2098.08349609375, + "completions/mean_terminated_length": 2098.08349609375, + "completions/min_length": 842.0, + "completions/min_terminated_length": 842.0, + "epoch": 0.3266621438263229, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5058161616325378, + "kl": 0.0, + "learning_rate": 3.426501035196687e-07, + "loss": -0.0014, + "num_tokens": 32354557.0, + "reward": 0.8500000834465027, + "reward_std": 0.2557638883590698, + "rewards/correctness_reward_func/mean": 0.550000011920929, + "rewards/correctness_reward_func/std": 0.49082493782043457, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 963 + }, + { + "completion_length": 1694.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3257.0, + "completions/max_terminated_length": 3257.0, + "completions/mean_length": 1694.75, + "completions/mean_terminated_length": 1694.75, + "completions/min_length": 557.0, + "completions/min_terminated_length": 557.0, + "epoch": 0.32700135685210313, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.06396138668060303, + "kl": 0.0, + "learning_rate": 3.424775707384403e-07, + "loss": 0.0003, + "num_tokens": 32387842.0, + "reward": 0.7666666507720947, + "reward_std": 0.05163975805044174, + "rewards/correctness_reward_func/mean": 0.46666666865348816, + "rewards/correctness_reward_func/std": 0.4923659563064575, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 964 + }, + { + "completion_length": 1904.8333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6139.0, + "completions/max_terminated_length": 6139.0, + "completions/mean_length": 1904.8333740234375, + "completions/mean_terminated_length": 1904.8333740234375, + "completions/min_length": 552.0, + "completions/min_terminated_length": 552.0, + "epoch": 0.3273405698778833, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.1909196823835373, + "kl": 0.0, + "learning_rate": 3.423050379572118e-07, + "loss": 0.0087, + "num_tokens": 32422652.0, + "reward": 1.183333396911621, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.8833333849906921, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 965 + }, + { + "completion_length": 1142.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2871.0, + "completions/max_terminated_length": 2871.0, + "completions/mean_length": 1142.416748046875, + "completions/mean_terminated_length": 1142.416748046875, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.3276797829036635, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.0000242645655817e-07, + "kl": 0.0, + "learning_rate": 3.421325051759834e-07, + "loss": 0.0, + "num_tokens": 32446513.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 966 + }, + { + "completion_length": 2232.5834350585938, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5776.0, + "completions/mean_length": 2781.666748046875, + "completions/mean_terminated_length": 2435.54541015625, + "completions/min_length": 844.0, + "completions/min_terminated_length": 844.0, + "epoch": 0.32801899592944367, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.3459160327911377, + "kl": NaN, + "learning_rate": 3.4195997239475497e-07, + "loss": -0.0268, + "num_tokens": 32487488.0, + "reward": 1.1541666984558105, + "reward_std": 0.21588000655174255, + "rewards/correctness_reward_func/mean": 0.8666666150093079, + "rewards/correctness_reward_func/std": 0.287096232175827, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 967 + }, + { + "completion_length": 1698.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4551.0, + "completions/max_terminated_length": 4551.0, + "completions/mean_length": 1698.416748046875, + "completions/mean_terminated_length": 1698.416748046875, + "completions/min_length": 487.0, + "completions/min_terminated_length": 487.0, + "epoch": 0.3283582089552239, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6486458778381348, + "kl": 0.0, + "learning_rate": 3.417874396135266e-07, + "loss": 0.0035, + "num_tokens": 32516797.0, + "reward": 1.0500000715255737, + "reward_std": 0.299967497587204, + "rewards/correctness_reward_func/mean": 0.75, + "rewards/correctness_reward_func/std": 0.36306774616241455, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 968 + }, + { + "completion_length": 1960.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4384.0, + "completions/max_terminated_length": 4384.0, + "completions/mean_length": 1960.3333740234375, + "completions/mean_terminated_length": 1960.3333740234375, + "completions/min_length": 522.0, + "completions/min_terminated_length": 522.0, + "epoch": 0.32869742198100405, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4522709548473358, + "kl": 0.0, + "learning_rate": 3.416149068322981e-07, + "loss": -0.0257, + "num_tokens": 32551721.0, + "reward": 0.38333332538604736, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func/mean": 0.0833333358168602, + "rewards/correctness_reward_func/std": 0.28867512941360474, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 969 + }, + { + "completion_length": 1291.416748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2350.0, + "completions/max_terminated_length": 2350.0, + "completions/mean_length": 1291.416748046875, + "completions/mean_terminated_length": 1291.416748046875, + "completions/min_length": 583.0, + "completions/min_terminated_length": 583.0, + "epoch": 0.32903663500678426, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5910433530807495, + "kl": 0.0, + "learning_rate": 3.414423740510697e-07, + "loss": 0.0022, + "num_tokens": 32579584.0, + "reward": 1.0333335399627686, + "reward_std": 0.25819891691207886, + "rewards/correctness_reward_func/mean": 0.7333332896232605, + "rewards/correctness_reward_func/std": 0.3550501763820648, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 970 + }, + { + "completion_length": 2047.666748046875, + "completions/clipped_ratio": 0.25, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5423.0, + "completions/mean_length": 3694.916748046875, + "completions/mean_terminated_length": 2730.22216796875, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "epoch": 0.3293758480325645, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6515673398971558, + "kl": NaN, + "learning_rate": 3.412698412698413e-07, + "loss": 0.0215, + "num_tokens": 32614278.0, + "reward": 0.3916666507720947, + "reward_std": 0.34035724401474, + "rewards/correctness_reward_func/mean": 0.1666666716337204, + "rewards/correctness_reward_func/std": 0.3892494738101959, + "rewards/format_reward_func/mean": 0.22500000894069672, + "rewards/format_reward_func/std": 0.13568010926246643, + "step": 971 + }, + { + "completion_length": 1830.3333740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4031.0, + "completions/max_terminated_length": 4031.0, + "completions/mean_length": 1830.3333740234375, + "completions/mean_terminated_length": 1830.3333740234375, + "completions/min_length": 1033.0, + "completions/min_terminated_length": 1033.0, + "epoch": 0.32971506105834464, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1503661572933197, + "kl": 0.0, + "learning_rate": 3.4109730848861284e-07, + "loss": 0.0047, + "num_tokens": 32649274.0, + "reward": 1.2166666984558105, + "reward_std": 0.10641199350357056, + "rewards/correctness_reward_func/mean": 0.9166666865348816, + "rewards/correctness_reward_func/std": 0.10298572480678558, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 972 + }, + { + "completion_length": 797.8333435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1650.0, + "completions/max_terminated_length": 1650.0, + "completions/mean_length": 797.8333740234375, + "completions/mean_terminated_length": 797.8333740234375, + "completions/min_length": 454.0, + "completions/min_terminated_length": 454.0, + "epoch": 0.33005427408412485, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.0924046850723244e-07, + "kl": 0.0, + "learning_rate": 3.409247757073844e-07, + "loss": 0.0, + "num_tokens": 32672906.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 973 + }, + { + "completion_length": 1648.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4696.0, + "completions/max_terminated_length": 4696.0, + "completions/mean_length": 1648.0, + "completions/mean_terminated_length": 1648.0, + "completions/min_length": 584.0, + "completions/min_terminated_length": 584.0, + "epoch": 0.330393487109905, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.4075224292615595e-07, + "loss": 0.0, + "num_tokens": 32709692.0, + "reward": 0.30000001192092896, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 974 + }, + { + "completion_length": 795.8333435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1302.0, + "completions/max_terminated_length": 1302.0, + "completions/mean_length": 795.8333740234375, + "completions/mean_terminated_length": 795.8333740234375, + "completions/min_length": 529.0, + "completions/min_terminated_length": 529.0, + "epoch": 0.33073270013568523, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.773958626306921e-08, + "kl": 0.0, + "learning_rate": 3.4057971014492755e-07, + "loss": 0.0, + "num_tokens": 32731848.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 975 + }, + { + "completion_length": 2053.666748046875, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 5526.0, + "completions/mean_length": 2602.75, + "completions/mean_terminated_length": 2240.36376953125, + "completions/min_length": 801.0, + "completions/min_terminated_length": 801.0, + "epoch": 0.3310719131614654, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.41066351532936096, + "kl": NaN, + "learning_rate": 3.4040717736369905e-07, + "loss": 0.0118, + "num_tokens": 32770622.0, + "reward": 0.8583333492279053, + "reward_std": 0.28804606199264526, + "rewards/correctness_reward_func/mean": 0.5833333134651184, + "rewards/correctness_reward_func/std": 0.4386619031429291, + "rewards/format_reward_func/mean": 0.2750000059604645, + "rewards/format_reward_func/std": 0.08660253882408142, + "step": 976 + }, + { + "completion_length": 4053.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6396.0, + "completions/max_terminated_length": 6396.0, + "completions/mean_length": 4053.58349609375, + "completions/mean_terminated_length": 4053.58349609375, + "completions/min_length": 2075.0, + "completions/min_terminated_length": 2075.0, + "epoch": 0.3314111261872456, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.8918020689161494e-07, + "kl": 0.0, + "learning_rate": 3.4023464458247066e-07, + "loss": 0.0, + "num_tokens": 32829225.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 977 + }, + { + "completion_length": 1234.166748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4258.0, + "completions/max_terminated_length": 4258.0, + "completions/mean_length": 1234.166748046875, + "completions/mean_terminated_length": 1234.166748046875, + "completions/min_length": 605.0, + "completions/min_terminated_length": 605.0, + "epoch": 0.33175033921302577, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6808133125305176, + "kl": 0.0, + "learning_rate": 3.400621118012422e-07, + "loss": 0.026, + "num_tokens": 32852561.0, + "reward": 1.133333444595337, + "reward_std": 0.1632993221282959, + "rewards/correctness_reward_func/mean": 0.8333333134651184, + "rewards/correctness_reward_func/std": 0.2806917726993561, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 978 + }, + { + "completion_length": 2129.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3540.0, + "completions/max_terminated_length": 3540.0, + "completions/mean_length": 2129.416748046875, + "completions/mean_terminated_length": 2129.416748046875, + "completions/min_length": 742.0, + "completions/min_terminated_length": 742.0, + "epoch": 0.332089552238806, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.425616116852325e-07, + "kl": 0.0, + "learning_rate": 3.398895790200138e-07, + "loss": 0.0, + "num_tokens": 32892274.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 979 + }, + { + "completion_length": 1461.0833740234375, + "completions/clipped_ratio": 0.08333333333333337, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 3444.0, + "completions/mean_length": 2010.166748046875, + "completions/mean_terminated_length": 1593.9091796875, + "completions/min_length": 801.0, + "completions/min_terminated_length": 801.0, + "epoch": 0.33242876526458615, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.22468790411949158, + "kl": NaN, + "learning_rate": 3.397170462387853e-07, + "loss": -0.0282, + "num_tokens": 32921297.0, + "reward": 1.2041666507720947, + "reward_std": 0.23474274575710297, + "rewards/correctness_reward_func/mean": 0.9166666865348816, + "rewards/correctness_reward_func/std": 0.28867512941360474, + "rewards/format_reward_func/mean": 0.2874999940395355, + "rewards/format_reward_func/std": 0.04330127313733101, + "step": 980 + }, + { + "completion_length": 2105.416748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6066.0, + "completions/max_terminated_length": 6066.0, + "completions/mean_length": 2105.416748046875, + "completions/mean_terminated_length": 2105.416748046875, + "completions/min_length": 554.0, + "completions/min_terminated_length": 554.0, + "epoch": 0.33276797829036636, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.466020134761493e-07, + "kl": 0.0, + "learning_rate": 3.395445134575569e-07, + "loss": 0.0, + "num_tokens": 32956858.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 981 + }, + { + "completion_length": 1458.416748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3091.0, + "completions/max_terminated_length": 3091.0, + "completions/mean_length": 1458.416748046875, + "completions/mean_terminated_length": 1458.416748046875, + "completions/min_length": 540.0, + "completions/min_terminated_length": 540.0, + "epoch": 0.3331071913161465, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.5790541851856688e-07, + "kl": 0.0, + "learning_rate": 3.393719806763285e-07, + "loss": 0.0, + "num_tokens": 32984493.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 982 + }, + { + "completion_length": 1025.6666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1851.0, + "completions/max_terminated_length": 1851.0, + "completions/mean_length": 1025.666748046875, + "completions/mean_terminated_length": 1025.666748046875, + "completions/min_length": 477.0, + "completions/min_terminated_length": 477.0, + "epoch": 0.33344640434192674, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.08788394927978516, + "kl": 0.0, + "learning_rate": 3.391994478951001e-07, + "loss": 0.0007, + "num_tokens": 33010637.0, + "reward": 1.1166667938232422, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 983 + }, + { + "completion_length": 2536.416748046875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4205.0, + "completions/max_terminated_length": 4205.0, + "completions/mean_length": 2536.416748046875, + "completions/mean_terminated_length": 2536.416748046875, + "completions/min_length": 1341.0, + "completions/min_terminated_length": 1341.0, + "epoch": 0.3337856173677069, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.15756388008594513, + "kl": 0.0, + "learning_rate": 3.390269151138716e-07, + "loss": -0.0005, + "num_tokens": 33048778.0, + "reward": 0.7333333492279053, + "reward_std": 0.051639772951602936, + "rewards/correctness_reward_func/mean": 0.43333330750465393, + "rewards/correctness_reward_func/std": 0.45792683959007263, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 984 + }, + { + "completion_length": 832.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1372.0, + "completions/max_terminated_length": 1372.0, + "completions/mean_length": 832.4166870117188, + "completions/mean_terminated_length": 832.4166870117188, + "completions/min_length": 550.0, + "completions/min_terminated_length": 550.0, + "epoch": 0.3341248303934871, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.04834370315074921, + "kl": 0.0, + "learning_rate": 3.388543823326432e-07, + "loss": -0.0004, + "num_tokens": 33074493.0, + "reward": 0.7833333015441895, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.4833333492279053, + "rewards/correctness_reward_func/std": 0.5078176856040955, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 985 + }, + { + "completion_length": 1010.5833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1571.0, + "completions/max_terminated_length": 1571.0, + "completions/mean_length": 1010.5833740234375, + "completions/mean_terminated_length": 1010.5833740234375, + "completions/min_length": 518.0, + "completions/min_terminated_length": 518.0, + "epoch": 0.3344640434192673, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.7334615165509604e-07, + "kl": 0.0, + "learning_rate": 3.386818495514148e-07, + "loss": 0.0, + "num_tokens": 33099076.0, + "reward": 1.1000001430511475, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 986 + }, + { + "completion_length": 1127.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3103.0, + "completions/max_terminated_length": 3103.0, + "completions/mean_length": 1127.166748046875, + "completions/mean_terminated_length": 1127.166748046875, + "completions/min_length": 447.0, + "completions/min_terminated_length": 447.0, + "epoch": 0.3348032564450475, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.3109779357910156, + "kl": 0.0, + "learning_rate": 3.385093167701863e-07, + "loss": -0.0022, + "num_tokens": 33124332.0, + "reward": 0.7666667699813843, + "reward_std": 0.1632993221282959, + "rewards/correctness_reward_func/mean": 0.46666669845581055, + "rewards/correctness_reward_func/std": 0.4119429290294647, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 987 + }, + { + "completion_length": 1063.7500305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2977.0, + "completions/max_terminated_length": 2977.0, + "completions/mean_length": 1063.75, + "completions/mean_terminated_length": 1063.75, + "completions/min_length": 464.0, + "completions/min_terminated_length": 464.0, + "epoch": 0.33514246947082765, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.540890634059906, + "kl": 0.0, + "learning_rate": 3.383367839889579e-07, + "loss": 0.0257, + "num_tokens": 33149661.0, + "reward": 1.1166666746139526, + "reward_std": 0.24832773208618164, + "rewards/correctness_reward_func/mean": 0.8166666626930237, + "rewards/correctness_reward_func/std": 0.3857303261756897, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 988 + }, + { + "completion_length": 995.5833435058594, + "completions/clipped_ratio": 0.5, + "completions/max_length": 6589.0, + "completions/max_terminated_length": 3265.0, + "completions/mean_length": 4290.08349609375, + "completions/mean_terminated_length": 1991.166748046875, + "completions/min_length": 521.0, + "completions/min_terminated_length": 521.0, + "epoch": 0.33548168249660787, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": NaN, + "learning_rate": 3.3816425120772945e-07, + "loss": 0.0, + "num_tokens": 33172666.0, + "reward": 0.6499999761581421, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.5, + "rewards/correctness_reward_func/std": 0.5222329497337341, + "rewards/format_reward_func/mean": 0.15000000596046448, + "rewards/format_reward_func/std": 0.15666989982128143, + "step": 989 + }, + { + "completion_length": 1330.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2185.0, + "completions/max_terminated_length": 2185.0, + "completions/mean_length": 1330.916748046875, + "completions/mean_terminated_length": 1330.916748046875, + "completions/min_length": 842.0, + "completions/min_terminated_length": 842.0, + "epoch": 0.3358208955223881, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.36015501618385315, + "kl": 0.0, + "learning_rate": 3.3799171842650106e-07, + "loss": 0.0031, + "num_tokens": 33200667.0, + "reward": 1.1166667938232422, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func/mean": 0.8166666030883789, + "rewards/correctness_reward_func/std": 0.27579087018966675, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 990 + }, + { + "completion_length": 1213.7500305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2447.0, + "completions/max_terminated_length": 2447.0, + "completions/mean_length": 1213.75, + "completions/mean_terminated_length": 1213.75, + "completions/min_length": 768.0, + "completions/min_terminated_length": 768.0, + "epoch": 0.33616010854816825, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.6253594026238716e-07, + "kl": 0.0, + "learning_rate": 3.3781918564527256e-07, + "loss": 0.0, + "num_tokens": 33226422.0, + "reward": 0.7000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.4000000059604645, + "rewards/correctness_reward_func/std": 0.4177863895893097, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 991 + }, + { + "completion_length": 1514.7500610351562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3412.0, + "completions/max_terminated_length": 3412.0, + "completions/mean_length": 1514.75, + "completions/mean_terminated_length": 1514.75, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.33649932157394846, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.840590471147152e-07, + "kl": 0.0, + "learning_rate": 3.3764665286404416e-07, + "loss": 0.0, + "num_tokens": 33257235.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 992 + }, + { + "completion_length": 794.7500305175781, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1140.0, + "completions/max_terminated_length": 1140.0, + "completions/mean_length": 794.75, + "completions/mean_terminated_length": 794.75, + "completions/min_length": 478.0, + "completions/min_terminated_length": 478.0, + "epoch": 0.3368385345997286, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.06748513877391815, + "kl": 0.0, + "learning_rate": 3.374741200828157e-07, + "loss": 0.0, + "num_tokens": 33277566.0, + "reward": 1.2833333015441895, + "reward_std": 0.040824808180332184, + "rewards/correctness_reward_func/mean": 0.9833333492279053, + "rewards/correctness_reward_func/std": 0.05773502215743065, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 993 + }, + { + "completion_length": 610.9166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1054.0, + "completions/max_terminated_length": 1054.0, + "completions/mean_length": 610.9166870117188, + "completions/mean_terminated_length": 610.9166870117188, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.33717774762550884, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.049711357802152634, + "kl": 0.0, + "learning_rate": 3.373015873015873e-07, + "loss": 0.0005, + "num_tokens": 33298025.0, + "reward": 1.2166666984558105, + "reward_std": 0.040824826806783676, + "rewards/correctness_reward_func/mean": 0.9166666865348816, + "rewards/correctness_reward_func/std": 0.10298573225736618, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 994 + }, + { + "completion_length": 1622.4166870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4141.0, + "completions/max_terminated_length": 4141.0, + "completions/mean_length": 1622.416748046875, + "completions/mean_terminated_length": 1622.416748046875, + "completions/min_length": 766.0, + "completions/min_terminated_length": 766.0, + "epoch": 0.337516960651289, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4773035943508148, + "kl": 0.0, + "learning_rate": 3.371290545203588e-07, + "loss": 0.0057, + "num_tokens": 33325894.0, + "reward": 1.1000001430511475, + "reward_std": 0.20000001788139343, + "rewards/correctness_reward_func/mean": 0.800000011920929, + "rewards/correctness_reward_func/std": 0.2696799635887146, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 995 + }, + { + "completion_length": 787.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1782.0, + "completions/max_terminated_length": 1782.0, + "completions/mean_length": 787.25, + "completions/mean_terminated_length": 787.25, + "completions/min_length": 481.0, + "completions/min_terminated_length": 481.0, + "epoch": 0.3378561736770692, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.43017894883269e-08, + "kl": 0.0, + "learning_rate": 3.369565217391304e-07, + "loss": 0.0, + "num_tokens": 33346219.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 996 + }, + { + "completion_length": 887.0833740234375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1226.0, + "completions/max_terminated_length": 1226.0, + "completions/mean_length": 887.0833740234375, + "completions/mean_terminated_length": 887.0833740234375, + "completions/min_length": 589.0, + "completions/min_terminated_length": 589.0, + "epoch": 0.3381953867028494, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.7791666095945402e-07, + "kl": 0.0, + "learning_rate": 3.36783988957902e-07, + "loss": 0.0, + "num_tokens": 33373244.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 997 + }, + { + "completion_length": 894.1666870117188, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1343.0, + "completions/max_terminated_length": 1343.0, + "completions/mean_length": 894.1666870117188, + "completions/mean_terminated_length": 894.1666870117188, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "epoch": 0.3385345997286296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 3.3661145617667353e-07, + "loss": 0.0, + "num_tokens": 33394702.0, + "reward": 1.2999999523162842, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 1.0, + "rewards/correctness_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 998 + }, + { + "completion_length": 802.0833435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1790.0, + "completions/max_terminated_length": 1790.0, + "completions/mean_length": 802.0833740234375, + "completions/mean_terminated_length": 802.0833740234375, + "completions/min_length": 472.0, + "completions/min_terminated_length": 472.0, + "epoch": 0.33887381275440975, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1298748254776001, + "kl": 0.0, + "learning_rate": 3.364389233954451e-07, + "loss": 0.0004, + "num_tokens": 33415397.0, + "reward": 1.2333333492279053, + "reward_std": 0.10327950119972229, + "rewards/correctness_reward_func/mean": 0.9333333373069763, + "rewards/correctness_reward_func/std": 0.0984731912612915, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 999 + }, + { + "completion_length": 594.8333435058594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 735.0, + "completions/max_terminated_length": 735.0, + "completions/mean_length": 594.8333740234375, + "completions/mean_terminated_length": 594.8333740234375, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.33921302578018997, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.740247910220205e-07, + "kl": 0.0, + "learning_rate": 3.362663906142167e-07, + "loss": 0.0, + "num_tokens": 33435015.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/correctness_reward_func/mean": 0.8999999165534973, + "rewards/correctness_reward_func/std": 0.10444658994674683, + "rewards/format_reward_func/mean": 0.30000001192092896, + "rewards/format_reward_func/std": 0.0, + "step": 1000 + } + ], + "logging_steps": 1, + "max_steps": 2948, + "num_input_tokens_seen": 33435015, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}