{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0135, "eval_steps": 500, "global_step": 1350, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 8.421052932739258, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.331065446138382, "epoch": 1e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.067528136074543, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0016, "num_tokens": 47914.0, "reward": -0.6903420090675354, "reward_std": 0.6007460951805115, "rewards/rollout_reward_func/mean": -0.6903420090675354, "rewards/rollout_reward_func/std": 0.6007461547851562, "sampling/importance_sampling_ratio/max": 0.5892598032951355, "sampling/importance_sampling_ratio/mean": 0.08479699492454529, "sampling/importance_sampling_ratio/min": 4.645641638489906e-06, "sampling/sampling_logp_difference/max": 2.1645991802215576, "sampling/sampling_logp_difference/mean": 0.42147812247276306, "step": 1, "step_time": 22.353196216994547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.331065446138382, "epoch": 2e-05, "grad_norm": 0.0651949793100357, "kl": 0.0, "learning_rate": 2.2857142857142855e-07, "loss": -0.0016, "step": 2, "step_time": 11.029836472007446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.09375, "completions/mean_terminated_length": 7.6666669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.2004198729991913, "epoch": 3e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.09031915664672852, "kl": 0.00026191053075308446, "learning_rate": 4.571428571428571e-07, "loss": -0.0132, "num_tokens": 94353.0, "reward": -0.5520539283752441, "reward_std": 0.99213707447052, "rewards/rollout_reward_func/mean": -0.5520539283752441, "rewards/rollout_reward_func/std": 0.9921370148658752, "sampling/importance_sampling_ratio/max": 0.5156832337379456, "sampling/importance_sampling_ratio/mean": 0.08639796823263168, "sampling/importance_sampling_ratio/min": 2.970448349515209e-07, "sampling/sampling_logp_difference/max": 2.2358927726745605, "sampling/sampling_logp_difference/mean": 0.3863257169723511, "step": 3, "step_time": 22.627653962001204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.1991601288318634, "epoch": 4e-05, "grad_norm": 0.08571849763393402, "kl": 0.00028220960120961536, "learning_rate": 6.857142857142857e-07, "loss": -0.0132, "step": 4, "step_time": 11.43730039098591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.09375, "completions/mean_terminated_length": 6.764706134796143, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.471197634935379, "epoch": 5e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.08415538817644119, "kl": 0.0002732964712777175, "learning_rate": 9.142857142857142e-07, "loss": -0.0133, "num_tokens": 142267.0, "reward": -0.5116921067237854, "reward_std": 0.838783860206604, "rewards/rollout_reward_func/mean": -0.5116921067237854, "rewards/rollout_reward_func/std": 0.8387837409973145, "sampling/importance_sampling_ratio/max": 0.43309977650642395, "sampling/importance_sampling_ratio/mean": 0.09898785501718521, "sampling/importance_sampling_ratio/min": 2.1658431705873227e-06, "sampling/sampling_logp_difference/max": 1.7770261764526367, "sampling/sampling_logp_difference/mean": 0.4502218961715698, "step": 5, "step_time": 21.646771896004793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.4712395668029785, "epoch": 6e-05, "grad_norm": 0.0843915194272995, "kl": 0.00026465590508450987, "learning_rate": 1.1428571428571428e-06, "loss": -0.0135, "step": 6, "step_time": 10.969789931012201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.96875, "completions/mean_terminated_length": 7.9375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.6507151424884796, "epoch": 7e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.07285328209400177, "kl": 0.00020437307193788, "learning_rate": 1.3714285714285715e-06, "loss": -0.0142, "num_tokens": 197296.0, "reward": -0.2720484733581543, "reward_std": 0.9121120572090149, "rewards/rollout_reward_func/mean": -0.2720484733581543, "rewards/rollout_reward_func/std": 0.9121120572090149, "sampling/importance_sampling_ratio/max": 0.32150977849960327, "sampling/importance_sampling_ratio/mean": 0.06931693851947784, "sampling/importance_sampling_ratio/min": 8.520295580183301e-10, "sampling/sampling_logp_difference/max": 2.3938450813293457, "sampling/sampling_logp_difference/mean": 0.5402501225471497, "step": 7, "step_time": 23.502103034988977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.6505111157894135, "epoch": 8e-05, "grad_norm": 0.073671855032444, "kl": 0.00025187027858919464, "learning_rate": 1.6e-06, "loss": -0.0143, "step": 8, "step_time": 11.590045341014047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.96875, "completions/mean_terminated_length": 6.785714626312256, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.839500665664673, "epoch": 9e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.07723838835954666, "kl": 0.0003002350786118768, "learning_rate": 1.8285714285714284e-06, "loss": -0.0088, "num_tokens": 252728.0, "reward": -0.1511690467596054, "reward_std": 0.8019614815711975, "rewards/rollout_reward_func/mean": -0.1511690467596054, "rewards/rollout_reward_func/std": 0.8019614219665527, "sampling/importance_sampling_ratio/max": 0.44795000553131104, "sampling/importance_sampling_ratio/mean": 0.08499083667993546, "sampling/importance_sampling_ratio/min": 2.5295844352513086e-07, "sampling/sampling_logp_difference/max": 2.1178975105285645, "sampling/sampling_logp_difference/mean": 0.5516008138656616, "step": 9, "step_time": 23.382705292955507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.8413107991218567, "epoch": 0.0001, "grad_norm": 0.07787494361400604, "kl": 0.00028583988205355126, "learning_rate": 2.057142857142857e-06, "loss": -0.009, "step": 10, "step_time": 11.108348134031985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.40625, "completions/mean_terminated_length": 6.4166669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.686554729938507, "epoch": 0.00011, "frac_reward_zero_std": 0.0, "grad_norm": 0.061457496136426926, "kl": 0.00026123742827621754, "learning_rate": 2.2857142857142856e-06, "loss": -0.0132, "num_tokens": 300741.0, "reward": -0.3825490176677704, "reward_std": 0.889327347278595, "rewards/rollout_reward_func/mean": -0.3825490176677704, "rewards/rollout_reward_func/std": 0.8893274068832397, "sampling/importance_sampling_ratio/max": 0.4756196439266205, "sampling/importance_sampling_ratio/mean": 0.07964488863945007, "sampling/importance_sampling_ratio/min": 1.411704364517874e-10, "sampling/sampling_logp_difference/max": 2.475975513458252, "sampling/sampling_logp_difference/mean": 0.5196713805198669, "step": 11, "step_time": 20.932228493024013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.6807935535907745, "epoch": 0.00012, "grad_norm": 0.06095537915825844, "kl": 0.00023759044870530488, "learning_rate": 2.5142857142857142e-06, "loss": -0.0132, "step": 12, "step_time": 10.252664893007022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.6875, "completions/mean_terminated_length": 8.736842155456543, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.2808737456798553, "epoch": 0.00013, "frac_reward_zero_std": 0.0, "grad_norm": 0.04100925847887993, "kl": 0.000288915336568607, "learning_rate": 2.742857142857143e-06, "loss": -0.0064, "num_tokens": 351439.0, "reward": -0.28606170415878296, "reward_std": 0.9252208471298218, "rewards/rollout_reward_func/mean": -0.28606170415878296, "rewards/rollout_reward_func/std": 0.9252208471298218, "sampling/importance_sampling_ratio/max": 0.3630754351615906, "sampling/importance_sampling_ratio/mean": 0.08464658260345459, "sampling/importance_sampling_ratio/min": 1.7966332279684138e-06, "sampling/sampling_logp_difference/max": 1.9266515970230103, "sampling/sampling_logp_difference/mean": 0.3992263078689575, "step": 13, "step_time": 22.688022773028933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.2844612896442413, "epoch": 0.00014, "grad_norm": 0.04055075719952583, "kl": 0.000359028774255421, "learning_rate": 2.9714285714285716e-06, "loss": -0.0064, "step": 14, "step_time": 11.585252415010473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.5100991129875183, "epoch": 0.00015, "frac_reward_zero_std": 0.0, "grad_norm": 0.04466276988387108, "kl": 0.0002961961072287522, "learning_rate": 3.2e-06, "loss": -0.0069, "num_tokens": 403853.0, "reward": -0.41670897603034973, "reward_std": 0.7409546971321106, "rewards/rollout_reward_func/mean": -0.41670897603034973, "rewards/rollout_reward_func/std": 0.7409546971321106, "sampling/importance_sampling_ratio/max": 0.44819176197052, "sampling/importance_sampling_ratio/mean": 0.0780433714389801, "sampling/importance_sampling_ratio/min": 2.6252179097241424e-08, "sampling/sampling_logp_difference/max": 2.190891742706299, "sampling/sampling_logp_difference/mean": 0.4699859619140625, "step": 15, "step_time": 22.349329067990766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.5102231800556183, "epoch": 0.00016, "grad_norm": 0.04535643756389618, "kl": 0.00035153374847141095, "learning_rate": 3.428571428571428e-06, "loss": -0.007, "step": 16, "step_time": 11.182219613008783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.8125, "completions/mean_terminated_length": 7.263157844543457, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.463021218776703, "epoch": 0.00017, "frac_reward_zero_std": 0.0, "grad_norm": 0.08419129997491837, "kl": 0.0006196565373102203, "learning_rate": 3.657142857142857e-06, "loss": -0.014, "num_tokens": 451256.0, "reward": -0.44835376739501953, "reward_std": 0.9799845814704895, "rewards/rollout_reward_func/mean": -0.44835376739501953, "rewards/rollout_reward_func/std": 0.9799846410751343, "sampling/importance_sampling_ratio/max": 0.46037861704826355, "sampling/importance_sampling_ratio/mean": 0.10810332000255585, "sampling/importance_sampling_ratio/min": 7.598838003275432e-09, "sampling/sampling_logp_difference/max": 2.236638069152832, "sampling/sampling_logp_difference/mean": 0.46017351746559143, "step": 17, "step_time": 21.09552997298306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.4650664925575256, "epoch": 0.00018, "grad_norm": 0.0842791497707367, "kl": 0.0006023951391398441, "learning_rate": 3.885714285714286e-06, "loss": -0.014, "step": 18, "step_time": 10.96038374802447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.625, "completions/mean_terminated_length": 5.230769634246826, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.3152840435504913, "epoch": 0.00019, "frac_reward_zero_std": 0.0, "grad_norm": 0.03799505531787872, "kl": 0.0006445377293857746, "learning_rate": 4.114285714285714e-06, "loss": -0.0037, "num_tokens": 501463.0, "reward": -0.3104742169380188, "reward_std": 0.9027041792869568, "rewards/rollout_reward_func/mean": -0.3104742169380188, "rewards/rollout_reward_func/std": 0.9027041792869568, "sampling/importance_sampling_ratio/max": 0.4972159266471863, "sampling/importance_sampling_ratio/mean": 0.10313369333744049, "sampling/importance_sampling_ratio/min": 1.9375032422885852e-07, "sampling/sampling_logp_difference/max": 2.1190357208251953, "sampling/sampling_logp_difference/mean": 0.44657862186431885, "step": 19, "step_time": 23.25839726299455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.3099732995033264, "epoch": 0.0002, "grad_norm": 0.038678109645843506, "kl": 0.0008431800451944582, "learning_rate": 4.342857142857142e-06, "loss": -0.0038, "step": 20, "step_time": 11.461061434994917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 6.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.4496210515499115, "epoch": 0.00021, "frac_reward_zero_std": 0.0, "grad_norm": 0.05361214652657509, "kl": 0.0012927426141686738, "learning_rate": 4.571428571428571e-06, "loss": 0.0107, "num_tokens": 552686.0, "reward": -0.35486793518066406, "reward_std": 0.8452429175376892, "rewards/rollout_reward_func/mean": -0.35486793518066406, "rewards/rollout_reward_func/std": 0.8452429175376892, "sampling/importance_sampling_ratio/max": 0.3839678168296814, "sampling/importance_sampling_ratio/mean": 0.1069885790348053, "sampling/importance_sampling_ratio/min": 4.254477971699089e-06, "sampling/sampling_logp_difference/max": 2.219296932220459, "sampling/sampling_logp_difference/mean": 0.4475403130054474, "step": 21, "step_time": 22.79311488897656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.441692739725113, "epoch": 0.00022, "grad_norm": 0.054131582379341125, "kl": 0.0016429135357611813, "learning_rate": 4.8e-06, "loss": 0.0108, "step": 22, "step_time": 11.442528568019043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.21875, "completions/mean_terminated_length": 8.882352828979492, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.5638852417469025, "epoch": 0.00023, "frac_reward_zero_std": 0.0, "grad_norm": 0.07161016017198563, "kl": 0.0015593338466715068, "learning_rate": 5.0285714285714285e-06, "loss": -0.0077, "num_tokens": 600050.0, "reward": -0.2630671560764313, "reward_std": 1.1035856008529663, "rewards/rollout_reward_func/mean": -0.2630671560764313, "rewards/rollout_reward_func/std": 1.1035856008529663, "sampling/importance_sampling_ratio/max": 0.5658721327781677, "sampling/importance_sampling_ratio/mean": 0.09414377063512802, "sampling/importance_sampling_ratio/min": 3.2332579280591744e-08, "sampling/sampling_logp_difference/max": 2.8802013397216797, "sampling/sampling_logp_difference/mean": 0.4844944179058075, "step": 23, "step_time": 21.737700901998323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.558368593454361, "epoch": 0.00024, "grad_norm": 0.07405676692724228, "kl": 0.0018312205720576458, "learning_rate": 5.257142857142857e-06, "loss": -0.0079, "step": 24, "step_time": 10.834160281010554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.96875, "completions/mean_terminated_length": 6.785714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.21578249335289, "epoch": 0.00025, "frac_reward_zero_std": 0.0, "grad_norm": 0.16046564280986786, "kl": 0.0017883982072817162, "learning_rate": 5.485714285714286e-06, "loss": -0.0231, "num_tokens": 645631.0, "reward": -0.4874444603919983, "reward_std": 0.8735429644584656, "rewards/rollout_reward_func/mean": -0.4874444603919983, "rewards/rollout_reward_func/std": 0.8735429644584656, "sampling/importance_sampling_ratio/max": 0.5812484622001648, "sampling/importance_sampling_ratio/mean": 0.12126627564430237, "sampling/importance_sampling_ratio/min": 1.7416381524526514e-05, "sampling/sampling_logp_difference/max": 1.9635381698608398, "sampling/sampling_logp_difference/mean": 0.39675819873809814, "step": 25, "step_time": 20.937413668987574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.2116454541683197, "epoch": 0.00026, "grad_norm": 0.16915257275104523, "kl": 0.0023082312109181657, "learning_rate": 5.7142857142857145e-06, "loss": -0.0236, "step": 26, "step_time": 10.396956631026114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.34375, "completions/mean_terminated_length": 7.235294342041016, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.630482941865921, "epoch": 0.00027, "frac_reward_zero_std": 0.0, "grad_norm": 0.09839548170566559, "kl": 0.004891019969363697, "learning_rate": 5.942857142857143e-06, "loss": -0.0212, "num_tokens": 697071.0, "reward": 0.022112369537353516, "reward_std": 1.0570204257965088, "rewards/rollout_reward_func/mean": 0.022112369537353516, "rewards/rollout_reward_func/std": 1.0570204257965088, "sampling/importance_sampling_ratio/max": 0.6362606287002563, "sampling/importance_sampling_ratio/mean": 0.11613079905509949, "sampling/importance_sampling_ratio/min": 1.955389095087412e-08, "sampling/sampling_logp_difference/max": 2.9360551834106445, "sampling/sampling_logp_difference/mean": 0.5231769680976868, "step": 27, "step_time": 22.498547083028825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.6238242387771606, "epoch": 0.00028, "grad_norm": 0.09865443408489227, "kl": 0.006614872516365722, "learning_rate": 6.171428571428571e-06, "loss": -0.0218, "step": 28, "step_time": 11.552673565995065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.1875, "completions/mean_terminated_length": 6.941176414489746, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.516836106777191, "epoch": 0.00029, "frac_reward_zero_std": 0.0, "grad_norm": 0.11744082719087601, "kl": 0.00848110557126347, "learning_rate": 6.4e-06, "loss": -0.0204, "num_tokens": 750459.0, "reward": 0.029832355678081512, "reward_std": 0.9453108906745911, "rewards/rollout_reward_func/mean": 0.029832355678081512, "rewards/rollout_reward_func/std": 0.9453108906745911, "sampling/importance_sampling_ratio/max": 0.5640401244163513, "sampling/importance_sampling_ratio/mean": 0.1448831707239151, "sampling/importance_sampling_ratio/min": 8.166720135704963e-07, "sampling/sampling_logp_difference/max": 2.7389912605285645, "sampling/sampling_logp_difference/mean": 0.4838433265686035, "step": 29, "step_time": 23.156525827012956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.495699644088745, "epoch": 0.0003, "grad_norm": 0.12790432572364807, "kl": 0.01186994687304832, "learning_rate": 6.628571428571428e-06, "loss": -0.0214, "step": 30, "step_time": 11.431964931980474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.8125, "completions/mean_terminated_length": 6.099999904632568, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.2056614756584167, "epoch": 0.00031, "frac_reward_zero_std": 0.0, "grad_norm": 0.10904072970151901, "kl": 0.015143080381676555, "learning_rate": 6.857142857142856e-06, "loss": -0.0088, "num_tokens": 807650.0, "reward": 0.04746940732002258, "reward_std": 0.989336371421814, "rewards/rollout_reward_func/mean": 0.04746940732002258, "rewards/rollout_reward_func/std": 0.989336371421814, "sampling/importance_sampling_ratio/max": 0.628425657749176, "sampling/importance_sampling_ratio/mean": 0.2011766880750656, "sampling/importance_sampling_ratio/min": 1.929063500938355e-06, "sampling/sampling_logp_difference/max": 2.0493457317352295, "sampling/sampling_logp_difference/mean": 0.40734636783599854, "step": 31, "step_time": 22.683987885015085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.170373499393463, "epoch": 0.00032, "grad_norm": 0.11497274041175842, "kl": 0.02012204215861857, "learning_rate": 7.085714285714285e-06, "loss": -0.0097, "step": 32, "step_time": 11.299322177976137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.09375, "completions/mean_terminated_length": 6.3913044929504395, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.2048952877521515, "epoch": 0.00033, "frac_reward_zero_std": 0.0, "grad_norm": 0.1881689578294754, "kl": 0.04292509704828262, "learning_rate": 7.314285714285714e-06, "loss": -0.0248, "num_tokens": 859560.0, "reward": -0.25044524669647217, "reward_std": 0.9455000758171082, "rewards/rollout_reward_func/mean": -0.25044524669647217, "rewards/rollout_reward_func/std": 0.9455000162124634, "sampling/importance_sampling_ratio/max": 0.8529658913612366, "sampling/importance_sampling_ratio/mean": 0.24987104535102844, "sampling/importance_sampling_ratio/min": 1.893389907081655e-07, "sampling/sampling_logp_difference/max": 2.200258255004883, "sampling/sampling_logp_difference/mean": 0.4482192099094391, "step": 33, "step_time": 22.62012216100993 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.002314814832061529, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01012731483206153, "entropy": 3.1656673550605774, "epoch": 0.00034, "grad_norm": 0.12230686098337173, "kl": 0.05624912539497018, "learning_rate": 7.542857142857142e-06, "loss": -0.0268, "step": 34, "step_time": 11.201537562010344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.9375, "completions/mean_terminated_length": 6.300000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.975287616252899, "epoch": 0.00035, "frac_reward_zero_std": 0.0, "grad_norm": 0.12040606141090393, "kl": 0.06121919024735689, "learning_rate": 7.771428571428572e-06, "loss": -0.0063, "num_tokens": 903554.0, "reward": -0.17564575374126434, "reward_std": 1.1629403829574585, "rewards/rollout_reward_func/mean": -0.17564575374126434, "rewards/rollout_reward_func/std": 1.1629403829574585, "sampling/importance_sampling_ratio/max": 1.064441204071045, "sampling/importance_sampling_ratio/mean": 0.20792272686958313, "sampling/importance_sampling_ratio/min": 1.0429845360704348e-06, "sampling/sampling_logp_difference/max": 2.0465455055236816, "sampling/sampling_logp_difference/mean": 0.40237146615982056, "step": 35, "step_time": 21.90219988601166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.936492145061493, "epoch": 0.00036, "grad_norm": 0.12158767879009247, "kl": 0.07811583438888192, "learning_rate": 8e-06, "loss": -0.0069, "step": 36, "step_time": 11.011566410990781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.28125, "completions/mean_terminated_length": 5.26086950302124, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.6023616045713425, "epoch": 0.00037, "frac_reward_zero_std": 0.0, "grad_norm": 0.17067834734916687, "kl": 0.10389660205692053, "learning_rate": 7.99999999962976e-06, "loss": -0.0434, "num_tokens": 953679.0, "reward": -0.0540165975689888, "reward_std": 1.153792381286621, "rewards/rollout_reward_func/mean": -0.0540165975689888, "rewards/rollout_reward_func/std": 1.153792381286621, "sampling/importance_sampling_ratio/max": 1.2885006666183472, "sampling/importance_sampling_ratio/mean": 0.47981512546539307, "sampling/importance_sampling_ratio/min": 2.3645140956318755e-08, "sampling/sampling_logp_difference/max": 2.260319232940674, "sampling/sampling_logp_difference/mean": 0.3792839050292969, "step": 37, "step_time": 21.692669438023586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5224302113056183, "epoch": 0.00038, "grad_norm": 0.1819714903831482, "kl": 0.1235731691122055, "learning_rate": 7.99999999851904e-06, "loss": -0.0448, "step": 38, "step_time": 12.145085800002562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.625, "completions/mean_terminated_length": 5.800000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.8431569933891296, "epoch": 0.00039, "frac_reward_zero_std": 0.0, "grad_norm": 0.18984355032444, "kl": 0.15226105693727732, "learning_rate": 7.999999996667841e-06, "loss": -0.0481, "num_tokens": 1000309.0, "reward": 0.0038850903511047363, "reward_std": 1.2777538299560547, "rewards/rollout_reward_func/mean": 0.0038850903511047363, "rewards/rollout_reward_func/std": 1.2777537107467651, "sampling/importance_sampling_ratio/max": 1.9498404264450073, "sampling/importance_sampling_ratio/mean": 0.4910849630832672, "sampling/importance_sampling_ratio/min": 4.330795988494174e-08, "sampling/sampling_logp_difference/max": 2.1275734901428223, "sampling/sampling_logp_difference/mean": 0.45745885372161865, "step": 39, "step_time": 20.061336310973275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00572916679084301, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00572916679084301, "entropy": 2.772933006286621, "epoch": 0.0004, "grad_norm": 0.18072468042373657, "kl": 0.19887211918830872, "learning_rate": 7.999999994076165e-06, "loss": -0.0511, "step": 40, "step_time": 10.50222828501137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.28125, "completions/mean_terminated_length": 5.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.645665764808655, "epoch": 0.00041, "frac_reward_zero_std": 0.0, "grad_norm": 0.25997859239578247, "kl": 0.3203405560925603, "learning_rate": 7.999999990744006e-06, "loss": -0.0711, "num_tokens": 1053849.0, "reward": -0.1301480084657669, "reward_std": 1.0133686065673828, "rewards/rollout_reward_func/mean": -0.1301480084657669, "rewards/rollout_reward_func/std": 1.0133686065673828, "sampling/importance_sampling_ratio/max": 1.8776235580444336, "sampling/importance_sampling_ratio/mean": 0.40281736850738525, "sampling/importance_sampling_ratio/min": 6.371399176481607e-13, "sampling/sampling_logp_difference/max": 2.9654440879821777, "sampling/sampling_logp_difference/mean": 0.5151076912879944, "step": 41, "step_time": 21.50482026899408 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.04268385819159448, "clip_ratio/low_min": 0.0062500000931322575, "clip_ratio/region_mean": 0.05049635819159448, "entropy": 2.5742486119270325, "epoch": 0.00042, "grad_norm": 0.16092771291732788, "kl": 0.42648048140108585, "learning_rate": 7.999999986671369e-06, "loss": -0.0751, "step": 42, "step_time": 11.271334022007068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.84375, "completions/mean_terminated_length": 5.535714626312256, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9823518097400665, "epoch": 0.00043, "frac_reward_zero_std": 0.0, "grad_norm": 0.3459279537200928, "kl": 0.46911117248237133, "learning_rate": 7.999999981858253e-06, "loss": -0.119, "num_tokens": 1098771.0, "reward": 0.280550092458725, "reward_std": 1.2457724809646606, "rewards/rollout_reward_func/mean": 0.280550092458725, "rewards/rollout_reward_func/std": 1.2457724809646606, "sampling/importance_sampling_ratio/max": 2.3583717346191406, "sampling/importance_sampling_ratio/mean": 0.8540716171264648, "sampling/importance_sampling_ratio/min": 1.0337089406675659e-08, "sampling/sampling_logp_difference/max": 2.5524983406066895, "sampling/sampling_logp_difference/mean": 0.46814921498298645, "step": 43, "step_time": 21.147156746010296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03046875004656613, "clip_ratio/low_min": 0.0078125, "clip_ratio/region_mean": 0.03046875004656613, "entropy": 1.9202526435256004, "epoch": 0.00044, "grad_norm": 0.2751235067844391, "kl": 0.5959468148648739, "learning_rate": 7.999999976304658e-06, "loss": -0.1239, "step": 44, "step_time": 11.318361593977897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.92307710647583, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8742617070674896, "epoch": 0.00045, "frac_reward_zero_std": 0.0, "grad_norm": 0.1728961169719696, "kl": 0.4488591104745865, "learning_rate": 7.999999970010581e-06, "loss": -0.0585, "num_tokens": 1148783.0, "reward": 0.013908922672271729, "reward_std": 1.008949875831604, "rewards/rollout_reward_func/mean": 0.013908922672271729, "rewards/rollout_reward_func/std": 1.008949875831604, "sampling/importance_sampling_ratio/max": 2.4314839839935303, "sampling/importance_sampling_ratio/mean": 0.844772219657898, "sampling/importance_sampling_ratio/min": 2.7535739377526625e-07, "sampling/sampling_logp_difference/max": 2.381523847579956, "sampling/sampling_logp_difference/mean": 0.42901575565338135, "step": 45, "step_time": 21.54666677201749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.028819444589316845, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028819444589316845, "entropy": 1.814479999244213, "epoch": 0.00046, "grad_norm": 0.17651213705539703, "kl": 0.5856620073318481, "learning_rate": 7.999999962976027e-06, "loss": -0.0586, "step": 46, "step_time": 11.438097037986154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.6875, "completions/mean_terminated_length": 4.300000190734863, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.676477149128914, "epoch": 0.00047, "frac_reward_zero_std": 0.0, "grad_norm": 0.2729934751987457, "kl": 0.5742938099429011, "learning_rate": 7.999999955200991e-06, "loss": -0.048, "num_tokens": 1206478.0, "reward": 0.10402985662221909, "reward_std": 0.9449570178985596, "rewards/rollout_reward_func/mean": 0.10402985662221909, "rewards/rollout_reward_func/std": 0.9449570178985596, "sampling/importance_sampling_ratio/max": 2.567368507385254, "sampling/importance_sampling_ratio/mean": 0.7188885807991028, "sampling/importance_sampling_ratio/min": 1.454947380352678e-07, "sampling/sampling_logp_difference/max": 2.440084934234619, "sampling/sampling_logp_difference/mean": 0.5642155408859253, "step": 47, "step_time": 22.549174895015312 }, { "clip_ratio/high_max": 0.016406250186264515, "clip_ratio/high_mean": 0.008203125093132257, "clip_ratio/low_mean": 0.018694196827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026897321920841932, "entropy": 2.6487672328948975, "epoch": 0.00048, "grad_norm": 0.263638436794281, "kl": 0.4633852206170559, "learning_rate": 7.999999946685478e-06, "loss": -0.0494, "step": 48, "step_time": 12.250911051029107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.03125, "completions/mean_terminated_length": 4.607142925262451, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1170806773006916, "epoch": 0.00049, "frac_reward_zero_std": 0.0, "grad_norm": 0.13287389278411865, "kl": 1.2291827723383904, "learning_rate": 7.999999937429484e-06, "loss": -0.0882, "num_tokens": 1248511.0, "reward": 1.1831860542297363, "reward_std": 1.2337859869003296, "rewards/rollout_reward_func/mean": 1.1831860542297363, "rewards/rollout_reward_func/std": 1.23378586769104, "sampling/importance_sampling_ratio/max": 2.558846950531006, "sampling/importance_sampling_ratio/mean": 1.0698950290679932, "sampling/importance_sampling_ratio/min": 2.1141720935702324e-05, "sampling/sampling_logp_difference/max": 3.399393320083618, "sampling/sampling_logp_difference/mean": 0.3007461726665497, "step": 49, "step_time": 19.712339470992447 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.1042792052030563, "epoch": 0.0005, "grad_norm": 0.14402571320533752, "kl": 1.2918741181492805, "learning_rate": 7.999999927433012e-06, "loss": -0.0878, "step": 50, "step_time": 10.97148370697687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.90625, "completions/mean_terminated_length": 6.5416669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.295081675052643, "epoch": 0.00051, "frac_reward_zero_std": 0.0, "grad_norm": 0.15618117153644562, "kl": 0.3060055747628212, "learning_rate": 7.99999991669606e-06, "loss": -0.0781, "num_tokens": 1306662.0, "reward": 0.22532258927822113, "reward_std": 1.0694938898086548, "rewards/rollout_reward_func/mean": 0.22532258927822113, "rewards/rollout_reward_func/std": 1.0694938898086548, "sampling/importance_sampling_ratio/max": 2.6759517192840576, "sampling/importance_sampling_ratio/mean": 0.6926659345626831, "sampling/importance_sampling_ratio/min": 5.844297845669644e-08, "sampling/sampling_logp_difference/max": 3.0116682052612305, "sampling/sampling_logp_difference/mean": 0.4620894193649292, "step": 51, "step_time": 21.51429315697169 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 2.283620446920395, "epoch": 0.00052, "grad_norm": 0.16130976378917694, "kl": 0.2779053458943963, "learning_rate": 7.999999905218627e-06, "loss": -0.0784, "step": 52, "step_time": 11.23550975600665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 5.119999885559082, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8844577074050903, "epoch": 0.00053, "frac_reward_zero_std": 0.0, "grad_norm": 0.4120755195617676, "kl": 2.109809746965766, "learning_rate": 7.999999893000716e-06, "loss": -0.0627, "num_tokens": 1354896.0, "reward": 0.7022179365158081, "reward_std": 1.2377476692199707, "rewards/rollout_reward_func/mean": 0.7022179365158081, "rewards/rollout_reward_func/std": 1.2377475500106812, "sampling/importance_sampling_ratio/max": 2.9547407627105713, "sampling/importance_sampling_ratio/mean": 0.8223050832748413, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.207443714141846, "sampling/sampling_logp_difference/mean": 0.4134155511856079, "step": 53, "step_time": 20.61244194599567 }, { "clip_ratio/high_max": 0.021875000093132257, "clip_ratio/high_mean": 0.010937500046566129, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010937500046566129, "entropy": 1.8850480318069458, "epoch": 0.00054, "grad_norm": 0.28038620948791504, "kl": 1.5563265196979046, "learning_rate": 7.999999880042326e-06, "loss": -0.0648, "step": 54, "step_time": 10.5276128329715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.40625, "completions/mean_terminated_length": 5.4347825050354, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.071345701813698, "epoch": 0.00055, "frac_reward_zero_std": 0.0, "grad_norm": 0.20396509766578674, "kl": 0.5017293430864811, "learning_rate": 7.999999866343456e-06, "loss": -0.0298, "num_tokens": 1401485.0, "reward": 0.337041974067688, "reward_std": 1.3613700866699219, "rewards/rollout_reward_func/mean": 0.337041974067688, "rewards/rollout_reward_func/std": 1.3613700866699219, "sampling/importance_sampling_ratio/max": 2.4921514987945557, "sampling/importance_sampling_ratio/mean": 0.7316647171974182, "sampling/importance_sampling_ratio/min": 3.232344170100987e-05, "sampling/sampling_logp_difference/max": 2.2848079204559326, "sampling/sampling_logp_difference/mean": 0.39961713552474976, "step": 55, "step_time": 19.380627282022033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0800223648548126, "epoch": 0.00056, "grad_norm": 0.24171072244644165, "kl": 0.4566202610731125, "learning_rate": 7.999999851904105e-06, "loss": -0.0307, "step": 56, "step_time": 10.438556976005202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.875, "completions/mean_terminated_length": 6.6666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5670346170663834, "epoch": 0.00057, "frac_reward_zero_std": 0.0, "grad_norm": 0.3365034759044647, "kl": 1.211877339053899, "learning_rate": 7.999999836724277e-06, "loss": -0.0483, "num_tokens": 1454056.0, "reward": -0.2963150143623352, "reward_std": 0.7197708487510681, "rewards/rollout_reward_func/mean": -0.2963150143623352, "rewards/rollout_reward_func/std": 0.7197708487510681, "sampling/importance_sampling_ratio/max": 1.6314971446990967, "sampling/importance_sampling_ratio/mean": 0.41459834575653076, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.9592041969299316, "sampling/sampling_logp_difference/mean": 0.4395844340324402, "step": 57, "step_time": 22.362659216028987 }, { "clip_ratio/high_max": 0.028125000186264515, "clip_ratio/high_mean": 0.014062500093132257, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0062500000931322575, "clip_ratio/region_mean": 0.020312500186264515, "entropy": 2.5688358768820763, "epoch": 0.00058, "grad_norm": 0.14407040178775787, "kl": 1.01949120615609, "learning_rate": 7.999999820803968e-06, "loss": -0.0494, "step": 58, "step_time": 12.756437756004743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.34375, "completions/mean_terminated_length": 5.34782600402832, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.214706376194954, "epoch": 0.00059, "frac_reward_zero_std": 0.0, "grad_norm": 0.5055947303771973, "kl": 1.1532465554773808, "learning_rate": 7.99999980414318e-06, "loss": -0.056, "num_tokens": 1503359.0, "reward": 0.8433462381362915, "reward_std": 1.1465744972229004, "rewards/rollout_reward_func/mean": 0.8433462381362915, "rewards/rollout_reward_func/std": 1.1465744972229004, "sampling/importance_sampling_ratio/max": 2.8415918350219727, "sampling/importance_sampling_ratio/mean": 0.6990048885345459, "sampling/importance_sampling_ratio/min": 6.029413270880468e-06, "sampling/sampling_logp_difference/max": 3.044267177581787, "sampling/sampling_logp_difference/mean": 0.4517472982406616, "step": 59, "step_time": 21.701349122988177 }, { "clip_ratio/high_max": 0.019497282803058624, "clip_ratio/high_mean": 0.012352808145806193, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012352808145806193, "entropy": 2.217276468873024, "epoch": 0.0006, "grad_norm": 0.21651878952980042, "kl": 0.63127619959414, "learning_rate": 7.999999786741913e-06, "loss": -0.0608, "step": 60, "step_time": 11.615051335989847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.21875, "completions/mean_terminated_length": 5.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1759772896766663, "epoch": 0.00061, "frac_reward_zero_std": 0.0, "grad_norm": 0.24702158570289612, "kl": 0.937526430003345, "learning_rate": 7.999999768600167e-06, "loss": -0.0591, "num_tokens": 1552641.0, "reward": 0.1664169430732727, "reward_std": 1.2512215375900269, "rewards/rollout_reward_func/mean": 0.1664169430732727, "rewards/rollout_reward_func/std": 1.2512215375900269, "sampling/importance_sampling_ratio/max": 2.725044012069702, "sampling/importance_sampling_ratio/mean": 0.6385691165924072, "sampling/importance_sampling_ratio/min": 2.9238407250886667e-09, "sampling/sampling_logp_difference/max": 3.105861186981201, "sampling/sampling_logp_difference/mean": 0.40863049030303955, "step": 61, "step_time": 20.097428161025164 }, { "clip_ratio/high_max": 0.015625000465661287, "clip_ratio/high_mean": 0.007812500232830644, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007812500232830644, "entropy": 2.1863250583410263, "epoch": 0.00062, "grad_norm": 0.24142329394817352, "kl": 0.7550843581557274, "learning_rate": 7.99999974971794e-06, "loss": -0.0597, "step": 62, "step_time": 10.572766303957906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.3125, "completions/mean_terminated_length": 5.304347991943359, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1445166021585464, "epoch": 0.00063, "frac_reward_zero_std": 0.0, "grad_norm": 0.18167690932750702, "kl": 0.409351022914052, "learning_rate": 7.999999730095235e-06, "loss": -0.0331, "num_tokens": 1603097.0, "reward": 0.3788607120513916, "reward_std": 1.1366214752197266, "rewards/rollout_reward_func/mean": 0.3788607120513916, "rewards/rollout_reward_func/std": 1.1366214752197266, "sampling/importance_sampling_ratio/max": 2.910705804824829, "sampling/importance_sampling_ratio/mean": 0.7864817380905151, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.3020458221435547, "sampling/sampling_logp_difference/mean": 0.4240483045578003, "step": 63, "step_time": 21.859037008005544 }, { "clip_ratio/high_max": 0.005434782709926367, "clip_ratio/high_mean": 0.0027173913549631834, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027173913549631834, "entropy": 2.1572662740945816, "epoch": 0.00064, "grad_norm": 0.14156609773635864, "kl": 0.3361666686832905, "learning_rate": 7.99999970973205e-06, "loss": -0.0341, "step": 64, "step_time": 10.658063309980207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.84375, "completions/mean_terminated_length": 5.095238208770752, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3517883867025375, "epoch": 0.00065, "frac_reward_zero_std": 0.0, "grad_norm": 0.09635261446237564, "kl": 0.685079051181674, "learning_rate": 7.999999688628386e-06, "loss": -0.0551, "num_tokens": 1654963.0, "reward": 0.26505351066589355, "reward_std": 1.1865493059158325, "rewards/rollout_reward_func/mean": 0.26505351066589355, "rewards/rollout_reward_func/std": 1.1865493059158325, "sampling/importance_sampling_ratio/max": 2.0687918663024902, "sampling/importance_sampling_ratio/mean": 0.41873979568481445, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.5207619667053223, "sampling/sampling_logp_difference/mean": 0.4279531240463257, "step": 65, "step_time": 22.235079823018168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3588332533836365, "epoch": 0.00066, "grad_norm": 0.0948866754770279, "kl": 0.612475760281086, "learning_rate": 7.999999666784243e-06, "loss": -0.0557, "step": 66, "step_time": 11.527755821007304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.9375, "completions/mean_terminated_length": 5.259259223937988, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9885903745889664, "epoch": 0.00067, "frac_reward_zero_std": 0.0, "grad_norm": 0.1488756388425827, "kl": 0.3400424728170037, "learning_rate": 7.999999644199619e-06, "loss": -0.0971, "num_tokens": 1707281.0, "reward": 0.7047034502029419, "reward_std": 1.1272485256195068, "rewards/rollout_reward_func/mean": 0.7047034502029419, "rewards/rollout_reward_func/std": 1.1272484064102173, "sampling/importance_sampling_ratio/max": 1.924314260482788, "sampling/importance_sampling_ratio/mean": 0.7478386163711548, "sampling/importance_sampling_ratio/min": 4.228686975693563e-06, "sampling/sampling_logp_difference/max": 1.9878804683685303, "sampling/sampling_logp_difference/mean": 0.3430236577987671, "step": 67, "step_time": 21.475633871988975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9897891581058502, "epoch": 0.00068, "grad_norm": 0.14236347377300262, "kl": 0.3326695756986737, "learning_rate": 7.999999620874517e-06, "loss": -0.0976, "step": 68, "step_time": 11.964250919991173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.3125, "completions/mean_terminated_length": 5.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3158281445503235, "epoch": 0.00069, "frac_reward_zero_std": 0.0, "grad_norm": 0.1856992244720459, "kl": 0.2652229554951191, "learning_rate": 7.999999596808934e-06, "loss": -0.0539, "num_tokens": 1760948.0, "reward": 0.21376898884773254, "reward_std": 1.0116567611694336, "rewards/rollout_reward_func/mean": 0.21376898884773254, "rewards/rollout_reward_func/std": 1.0116567611694336, "sampling/importance_sampling_ratio/max": 2.1901795864105225, "sampling/importance_sampling_ratio/mean": 0.6353507041931152, "sampling/importance_sampling_ratio/min": 1.4362026012904039e-09, "sampling/sampling_logp_difference/max": 2.4853687286376953, "sampling/sampling_logp_difference/mean": 0.46979326009750366, "step": 69, "step_time": 21.781032468978083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 2.3077437430620193, "epoch": 0.0007, "grad_norm": 0.18427376449108124, "kl": 0.2947603799402714, "learning_rate": 7.999999572002872e-06, "loss": -0.0543, "step": 70, "step_time": 11.292453254005522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.21875, "completions/mean_terminated_length": 6.039999961853027, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8950841575860977, "epoch": 0.00071, "frac_reward_zero_std": 0.0, "grad_norm": 0.17359010875225067, "kl": 0.45538472197949886, "learning_rate": 7.999999546456332e-06, "loss": -0.0553, "num_tokens": 1818221.0, "reward": 0.03623851761221886, "reward_std": 0.9123998284339905, "rewards/rollout_reward_func/mean": 0.03623851761221886, "rewards/rollout_reward_func/std": 0.9123997688293457, "sampling/importance_sampling_ratio/max": 2.169347047805786, "sampling/importance_sampling_ratio/mean": 0.6932936310768127, "sampling/importance_sampling_ratio/min": 0.00019197478832211345, "sampling/sampling_logp_difference/max": 2.209782600402832, "sampling/sampling_logp_difference/mean": 0.3537505567073822, "step": 71, "step_time": 21.854020272003254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003289473708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003289473708719015, "entropy": 1.8821485862135887, "epoch": 0.00072, "grad_norm": 0.16973605751991272, "kl": 0.4794400781393051, "learning_rate": 7.999999520169313e-06, "loss": -0.0559, "step": 72, "step_time": 11.374571235995973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.65625, "completions/mean_terminated_length": 4.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9766903147101402, "epoch": 0.00073, "frac_reward_zero_std": 0.0, "grad_norm": 0.17931640148162842, "kl": 0.31496390886604786, "learning_rate": 7.999999493141815e-06, "loss": -0.1172, "num_tokens": 1871475.0, "reward": 0.5627253651618958, "reward_std": 1.1313332319259644, "rewards/rollout_reward_func/mean": 0.5627253651618958, "rewards/rollout_reward_func/std": 1.1313332319259644, "sampling/importance_sampling_ratio/max": 2.1344687938690186, "sampling/importance_sampling_ratio/mean": 0.7493089437484741, "sampling/importance_sampling_ratio/min": 1.357847594363193e-07, "sampling/sampling_logp_difference/max": 1.9951118230819702, "sampling/sampling_logp_difference/mean": 0.3899509310722351, "step": 73, "step_time": 21.964024004002567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01122510340064764, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01122510340064764, "entropy": 1.9546855986118317, "epoch": 0.00074, "grad_norm": 0.16608695685863495, "kl": 0.36779424641281366, "learning_rate": 7.999999465373833e-06, "loss": -0.1175, "step": 74, "step_time": 11.211157774014282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.59375, "completions/mean_terminated_length": 4.517241477966309, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2464581280946732, "epoch": 0.00075, "frac_reward_zero_std": 0.0, "grad_norm": 0.23011654615402222, "kl": 0.8504077009856701, "learning_rate": 7.999999436865376e-06, "loss": -0.1337, "num_tokens": 1919515.0, "reward": 0.9816526174545288, "reward_std": 1.1029695272445679, "rewards/rollout_reward_func/mean": 0.9816526174545288, "rewards/rollout_reward_func/std": 1.1029695272445679, "sampling/importance_sampling_ratio/max": 2.801919937133789, "sampling/importance_sampling_ratio/mean": 1.0507925748825073, "sampling/importance_sampling_ratio/min": 2.5113224637607345e-06, "sampling/sampling_logp_difference/max": 2.070040702819824, "sampling/sampling_logp_difference/mean": 0.2812731862068176, "step": 75, "step_time": 20.705081060994416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.2200487665832043, "epoch": 0.00076, "grad_norm": 0.16792452335357666, "kl": 0.9943973766639829, "learning_rate": 7.999999407616439e-06, "loss": -0.1352, "step": 76, "step_time": 10.978833107990795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 4.538461685180664, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.319789569824934, "epoch": 0.00077, "frac_reward_zero_std": 0.0, "grad_norm": 0.39255908131599426, "kl": 0.6041842587292194, "learning_rate": 7.999999377627022e-06, "loss": -0.0555, "num_tokens": 1966218.0, "reward": 0.5327593088150024, "reward_std": 1.2519580125808716, "rewards/rollout_reward_func/mean": 0.5327593088150024, "rewards/rollout_reward_func/std": 1.2519581317901611, "sampling/importance_sampling_ratio/max": 2.2719550132751465, "sampling/importance_sampling_ratio/mean": 0.9220836758613586, "sampling/importance_sampling_ratio/min": 6.080873617975158e-07, "sampling/sampling_logp_difference/max": 2.3046875, "sampling/sampling_logp_difference/mean": 0.32381758093833923, "step": 77, "step_time": 20.3111697689601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.3053762465715408, "epoch": 0.00078, "grad_norm": 0.2128647416830063, "kl": 0.6581669319421053, "learning_rate": 7.999999346897126e-06, "loss": -0.0564, "step": 78, "step_time": 12.071838153977296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.428571701049805, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.042374961078167, "epoch": 0.00079, "frac_reward_zero_std": 0.0, "grad_norm": 0.13274335861206055, "kl": 0.3875476270914078, "learning_rate": 7.99999931542675e-06, "loss": -0.0759, "num_tokens": 2016866.0, "reward": 0.8403415083885193, "reward_std": 1.2454222440719604, "rewards/rollout_reward_func/mean": 0.8403415083885193, "rewards/rollout_reward_func/std": 1.2454222440719604, "sampling/importance_sampling_ratio/max": 1.86415433883667, "sampling/importance_sampling_ratio/mean": 0.9594419002532959, "sampling/importance_sampling_ratio/min": 0.00029570047627203166, "sampling/sampling_logp_difference/max": 2.4775633811950684, "sampling/sampling_logp_difference/mean": 0.21983076632022858, "step": 79, "step_time": 21.600077672977932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0245741941034794, "epoch": 0.0008, "grad_norm": 0.1261986494064331, "kl": 0.38688701018691063, "learning_rate": 7.999999283215897e-06, "loss": -0.0766, "step": 80, "step_time": 11.605201179991127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.03125, "completions/mean_terminated_length": 4.185185432434082, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3075156286358833, "epoch": 0.00081, "frac_reward_zero_std": 0.0, "grad_norm": 0.14337988197803497, "kl": 0.7996266148984432, "learning_rate": 7.999999250264562e-06, "loss": -0.0255, "num_tokens": 2071256.0, "reward": 0.27596843242645264, "reward_std": 1.1197608709335327, "rewards/rollout_reward_func/mean": 0.27596843242645264, "rewards/rollout_reward_func/std": 1.1197607517242432, "sampling/importance_sampling_ratio/max": 1.811834454536438, "sampling/importance_sampling_ratio/mean": 0.8423858880996704, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.7675957679748535, "sampling/sampling_logp_difference/mean": 0.42735350131988525, "step": 81, "step_time": 19.691832979995525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2901870235800743, "epoch": 0.00082, "grad_norm": 0.13614077866077423, "kl": 0.8218964375555515, "learning_rate": 7.999999216572749e-06, "loss": -0.0259, "step": 82, "step_time": 10.687911717992392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.59375, "completions/mean_terminated_length": 5.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3780150339007378, "epoch": 0.00083, "frac_reward_zero_std": 0.0, "grad_norm": 0.3587356209754944, "kl": 1.195137519389391, "learning_rate": 7.999999182140456e-06, "loss": -0.0327, "num_tokens": 2126562.0, "reward": 0.1801716834306717, "reward_std": 0.9977620840072632, "rewards/rollout_reward_func/mean": 0.1801716834306717, "rewards/rollout_reward_func/std": 0.9977620840072632, "sampling/importance_sampling_ratio/max": 1.8190855979919434, "sampling/importance_sampling_ratio/mean": 0.9234470129013062, "sampling/importance_sampling_ratio/min": 4.757030183100142e-06, "sampling/sampling_logp_difference/max": 3.2056727409362793, "sampling/sampling_logp_difference/mean": 0.35230642557144165, "step": 83, "step_time": 21.653691258019535 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0115327388048172, "entropy": 1.367248360067606, "epoch": 0.00084, "grad_norm": 0.29125550389289856, "kl": 1.1901373639702797, "learning_rate": 7.999999146967684e-06, "loss": -0.0342, "step": 84, "step_time": 11.062429140991298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.34375, "completions/mean_terminated_length": 5.344827651977539, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.954867422580719, "epoch": 0.00085, "frac_reward_zero_std": 0.0, "grad_norm": 0.2506013512611389, "kl": 1.4398251175880432, "learning_rate": 7.999999111054434e-06, "loss": -0.0611, "num_tokens": 2169711.0, "reward": 0.7761626243591309, "reward_std": 1.3720297813415527, "rewards/rollout_reward_func/mean": 0.7761626243591309, "rewards/rollout_reward_func/std": 1.3720297813415527, "sampling/importance_sampling_ratio/max": 1.7814128398895264, "sampling/importance_sampling_ratio/mean": 0.7897238731384277, "sampling/importance_sampling_ratio/min": 1.43541819852544e-05, "sampling/sampling_logp_difference/max": 2.748796224594116, "sampling/sampling_logp_difference/mean": 0.28658241033554077, "step": 85, "step_time": 18.990149877994554 }, { "clip_ratio/high_max": 0.013521634973585606, "clip_ratio/high_mean": 0.006760817486792803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006760817486792803, "entropy": 0.958494059741497, "epoch": 0.00086, "grad_norm": 0.20220187306404114, "kl": 1.328818928450346, "learning_rate": 7.999999074400703e-06, "loss": -0.0621, "step": 86, "step_time": 10.22814323300554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.28125, "completions/mean_terminated_length": 4.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3115105517208576, "epoch": 0.00087, "frac_reward_zero_std": 0.0, "grad_norm": 0.11934467405080795, "kl": 0.8390046562999487, "learning_rate": 7.999999037006494e-06, "loss": -0.1054, "num_tokens": 2222777.0, "reward": 0.7650569677352905, "reward_std": 1.221474528312683, "rewards/rollout_reward_func/mean": 0.7650569677352905, "rewards/rollout_reward_func/std": 1.2214746475219727, "sampling/importance_sampling_ratio/max": 1.8056381940841675, "sampling/importance_sampling_ratio/mean": 0.8305700421333313, "sampling/importance_sampling_ratio/min": 6.367351943481481e-06, "sampling/sampling_logp_difference/max": 2.868248701095581, "sampling/sampling_logp_difference/mean": 0.31630682945251465, "step": 87, "step_time": 21.808123193011852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.3068051598966122, "epoch": 0.00088, "grad_norm": 0.09801376610994339, "kl": 0.900336816906929, "learning_rate": 7.999998998871805e-06, "loss": -0.105, "step": 88, "step_time": 11.760031982019427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.09375, "completions/mean_terminated_length": 4.599999904632568, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.650472529232502, "epoch": 0.00089, "frac_reward_zero_std": 0.0, "grad_norm": 0.14199486374855042, "kl": 0.41304353810846806, "learning_rate": 7.999998959996637e-06, "loss": -0.0636, "num_tokens": 2272726.0, "reward": 0.2677881121635437, "reward_std": 1.0267105102539062, "rewards/rollout_reward_func/mean": 0.2677881121635437, "rewards/rollout_reward_func/std": 1.0267105102539062, "sampling/importance_sampling_ratio/max": 1.965194582939148, "sampling/importance_sampling_ratio/mean": 0.9101623296737671, "sampling/importance_sampling_ratio/min": 4.2385058804939035e-06, "sampling/sampling_logp_difference/max": 2.4292781352996826, "sampling/sampling_logp_difference/mean": 0.3541734218597412, "step": 89, "step_time": 21.20099919900531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6540691405534744, "epoch": 0.0009, "grad_norm": 0.14296142756938934, "kl": 0.4005184266716242, "learning_rate": 7.99999892038099e-06, "loss": -0.0637, "step": 90, "step_time": 11.428621466984623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.40625, "completions/mean_terminated_length": 4.954545497894287, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2521687671542168, "epoch": 0.00091, "frac_reward_zero_std": 0.0, "grad_norm": 0.173726424574852, "kl": 0.32821700163185596, "learning_rate": 7.999998880024863e-06, "loss": -0.0926, "num_tokens": 2330559.0, "reward": 0.3225027620792389, "reward_std": 1.0228989124298096, "rewards/rollout_reward_func/mean": 0.3225027620792389, "rewards/rollout_reward_func/std": 1.0228989124298096, "sampling/importance_sampling_ratio/max": 1.9962167739868164, "sampling/importance_sampling_ratio/mean": 0.7478345632553101, "sampling/importance_sampling_ratio/min": 5.041103090519528e-10, "sampling/sampling_logp_difference/max": 2.7169888019561768, "sampling/sampling_logp_difference/mean": 0.43760234117507935, "step": 91, "step_time": 22.751491838978836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.249836929142475, "epoch": 0.00092, "grad_norm": 0.17070414125919342, "kl": 0.33538318797945976, "learning_rate": 7.999998838928257e-06, "loss": -0.0934, "step": 92, "step_time": 12.635656431972166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.34375, "completions/mean_terminated_length": 5.7916669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8899141028523445, "epoch": 0.00093, "frac_reward_zero_std": 0.0, "grad_norm": 0.13530664145946503, "kl": 0.23707675142213702, "learning_rate": 7.999998797091172e-06, "loss": -0.0463, "num_tokens": 2378544.0, "reward": 0.7533355355262756, "reward_std": 1.186990737915039, "rewards/rollout_reward_func/mean": 0.7533355355262756, "rewards/rollout_reward_func/std": 1.186990737915039, "sampling/importance_sampling_ratio/max": 1.5323114395141602, "sampling/importance_sampling_ratio/mean": 0.6856932640075684, "sampling/importance_sampling_ratio/min": 1.2039871762681287e-05, "sampling/sampling_logp_difference/max": 1.8271903991699219, "sampling/sampling_logp_difference/mean": 0.3230045437812805, "step": 93, "step_time": 22.188623535024817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.892941951751709, "epoch": 0.00094, "grad_norm": 0.13837508857250214, "kl": 0.2382907117716968, "learning_rate": 7.999998754513608e-06, "loss": -0.0467, "step": 94, "step_time": 11.397554528986802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.4375, "completions/mean_terminated_length": 5.4782609939575195, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.087985634803772, "epoch": 0.00095, "frac_reward_zero_std": 0.0, "grad_norm": 0.1431470513343811, "kl": 0.7957198191434145, "learning_rate": 7.999998711195565e-06, "loss": -0.0864, "num_tokens": 2426816.0, "reward": 0.30910369753837585, "reward_std": 1.258506417274475, "rewards/rollout_reward_func/mean": 0.30910369753837585, "rewards/rollout_reward_func/std": 1.258506417274475, "sampling/importance_sampling_ratio/max": 2.0108141899108887, "sampling/importance_sampling_ratio/mean": 0.6902321577072144, "sampling/importance_sampling_ratio/min": 3.472003129445511e-08, "sampling/sampling_logp_difference/max": 2.0805647373199463, "sampling/sampling_logp_difference/mean": 0.4431988298892975, "step": 95, "step_time": 21.918470926990267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 2.083781599998474, "epoch": 0.00096, "grad_norm": 0.1074133813381195, "kl": 0.8004867471754551, "learning_rate": 7.999998667137043e-06, "loss": -0.0869, "step": 96, "step_time": 10.970348244023626 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.0625, "completions/mean_terminated_length": 5.4166669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9682438522577286, "epoch": 0.00097, "frac_reward_zero_std": 0.0, "grad_norm": 0.4791388511657715, "kl": 1.8114536702632904, "learning_rate": 7.999998622338041e-06, "loss": -0.0456, "num_tokens": 2477930.0, "reward": -0.05221004784107208, "reward_std": 1.073703646659851, "rewards/rollout_reward_func/mean": -0.05221004784107208, "rewards/rollout_reward_func/std": 1.073703646659851, "sampling/importance_sampling_ratio/max": 1.726181149482727, "sampling/importance_sampling_ratio/mean": 0.5065386295318604, "sampling/importance_sampling_ratio/min": 5.254844381852308e-07, "sampling/sampling_logp_difference/max": 2.8092148303985596, "sampling/sampling_logp_difference/mean": 0.4455947279930115, "step": 97, "step_time": 22.803125051985262 }, { "clip_ratio/high_max": 0.046875, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.01205357164144516, "clip_ratio/low_min": 0.0062500000931322575, "clip_ratio/region_mean": 0.03549107164144516, "entropy": 1.9618272930383682, "epoch": 0.00098, "grad_norm": 0.31828388571739197, "kl": 1.783431764692068, "learning_rate": 7.999998576798562e-06, "loss": -0.0479, "step": 98, "step_time": 11.516381426990847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.3125, "completions/mean_terminated_length": 6.538461685180664, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.815392792224884, "epoch": 0.00099, "frac_reward_zero_std": 0.0, "grad_norm": 0.1634523868560791, "kl": 0.788344094529748, "learning_rate": 7.999998530518601e-06, "loss": -0.0661, "num_tokens": 2522722.0, "reward": 0.14052090048789978, "reward_std": 1.3531723022460938, "rewards/rollout_reward_func/mean": 0.14052090048789978, "rewards/rollout_reward_func/std": 1.3531723022460938, "sampling/importance_sampling_ratio/max": 1.4729195833206177, "sampling/importance_sampling_ratio/mean": 0.5962989330291748, "sampling/importance_sampling_ratio/min": 5.9299841268511955e-06, "sampling/sampling_logp_difference/max": 2.5110418796539307, "sampling/sampling_logp_difference/mean": 0.37998253107070923, "step": 99, "step_time": 19.158886989040184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.8231475055217743, "epoch": 0.001, "grad_norm": 0.12461274117231369, "kl": 0.6930776899680495, "learning_rate": 7.999998483498162e-06, "loss": -0.067, "step": 100, "step_time": 10.34934641001746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 5.230769634246826, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7527764216065407, "epoch": 0.00101, "frac_reward_zero_std": 0.0, "grad_norm": 0.09975414723157883, "kl": 1.4448762349784374, "learning_rate": 7.999998435737244e-06, "loss": -0.1049, "num_tokens": 2574908.0, "reward": 0.8554249405860901, "reward_std": 1.1666871309280396, "rewards/rollout_reward_func/mean": 0.8554249405860901, "rewards/rollout_reward_func/std": 1.16668701171875, "sampling/importance_sampling_ratio/max": 1.9528539180755615, "sampling/importance_sampling_ratio/mean": 0.6642270684242249, "sampling/importance_sampling_ratio/min": 0.0005284405197016895, "sampling/sampling_logp_difference/max": 2.6062912940979004, "sampling/sampling_logp_difference/mean": 0.3681985139846802, "step": 101, "step_time": 21.68729741302377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.765073724091053, "epoch": 0.00102, "grad_norm": 0.0840354636311531, "kl": 1.1931252162903547, "learning_rate": 7.999998387235846e-06, "loss": -0.1056, "step": 102, "step_time": 12.586994933983078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.5625, "completions/mean_terminated_length": 6.190476417541504, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8910994678735733, "epoch": 0.00103, "frac_reward_zero_std": 0.0, "grad_norm": 0.08568953722715378, "kl": 0.4249183277133852, "learning_rate": 7.99999833799397e-06, "loss": -0.1004, "num_tokens": 2623386.0, "reward": 0.3800339698791504, "reward_std": 1.2790268659591675, "rewards/rollout_reward_func/mean": 0.3800339698791504, "rewards/rollout_reward_func/std": 1.2790268659591675, "sampling/importance_sampling_ratio/max": 2.7746803760528564, "sampling/importance_sampling_ratio/mean": 0.6060765385627747, "sampling/importance_sampling_ratio/min": 2.4925886464188807e-05, "sampling/sampling_logp_difference/max": 2.1550750732421875, "sampling/sampling_logp_difference/mean": 0.35591739416122437, "step": 103, "step_time": 20.98358174400346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8945105522871017, "epoch": 0.00104, "grad_norm": 0.08693988621234894, "kl": 0.385336289415136, "learning_rate": 7.999998288011616e-06, "loss": -0.1003, "step": 104, "step_time": 10.484735538004315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.46875, "completions/mean_terminated_length": 4.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.585211869329214, "epoch": 0.00105, "frac_reward_zero_std": 0.0, "grad_norm": 0.31320199370384216, "kl": 0.37785591930150986, "learning_rate": 7.999998237288781e-06, "loss": -0.0528, "num_tokens": 2683971.0, "reward": 0.5772043466567993, "reward_std": 0.899491012096405, "rewards/rollout_reward_func/mean": 0.5772043466567993, "rewards/rollout_reward_func/std": 0.899491012096405, "sampling/importance_sampling_ratio/max": 2.6374542713165283, "sampling/importance_sampling_ratio/mean": 1.0190811157226562, "sampling/importance_sampling_ratio/min": 1.2786565093847457e-05, "sampling/sampling_logp_difference/max": 2.07917857170105, "sampling/sampling_logp_difference/mean": 0.3539807200431824, "step": 105, "step_time": 22.357248725995305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.586122289299965, "epoch": 0.00106, "grad_norm": 0.29766929149627686, "kl": 0.406141871586442, "learning_rate": 7.999998185825468e-06, "loss": -0.0537, "step": 106, "step_time": 11.692759429992293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.40625, "completions/mean_terminated_length": 6.5789475440979, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.216046065092087, "epoch": 0.00107, "frac_reward_zero_std": 0.0, "grad_norm": 0.1957595944404602, "kl": 0.3487290171906352, "learning_rate": 7.999998133621676e-06, "loss": -0.0595, "num_tokens": 2729025.0, "reward": 0.6405514478683472, "reward_std": 1.3021762371063232, "rewards/rollout_reward_func/mean": 0.6405514478683472, "rewards/rollout_reward_func/std": 1.3021762371063232, "sampling/importance_sampling_ratio/max": 1.6555594205856323, "sampling/importance_sampling_ratio/mean": 0.4450260102748871, "sampling/importance_sampling_ratio/min": 2.3846744023892086e-11, "sampling/sampling_logp_difference/max": 3.0279488563537598, "sampling/sampling_logp_difference/mean": 0.44383811950683594, "step": 107, "step_time": 22.226838742004475 }, { "clip_ratio/high_max": 0.005434782709926367, "clip_ratio/high_mean": 0.0027173913549631834, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027173913549631834, "entropy": 2.2114066630601883, "epoch": 0.00108, "grad_norm": 0.12506748735904694, "kl": 0.36005968879908323, "learning_rate": 7.999998080677404e-06, "loss": -0.0601, "step": 108, "step_time": 10.919557692977833 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.34375, "completions/mean_terminated_length": 4.555555820465088, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1420988254249096, "epoch": 0.00109, "frac_reward_zero_std": 0.0, "grad_norm": 0.09896285086870193, "kl": 0.4780236566439271, "learning_rate": 7.999998026992654e-06, "loss": -0.0891, "num_tokens": 2776380.0, "reward": 1.1582741737365723, "reward_std": 1.1057847738265991, "rewards/rollout_reward_func/mean": 1.1582741737365723, "rewards/rollout_reward_func/std": 1.1057847738265991, "sampling/importance_sampling_ratio/max": 2.4707834720611572, "sampling/importance_sampling_ratio/mean": 0.9127234220504761, "sampling/importance_sampling_ratio/min": 0.0001032143845804967, "sampling/sampling_logp_difference/max": 2.6152894496917725, "sampling/sampling_logp_difference/mean": 0.2569553852081299, "step": 109, "step_time": 21.539743177010678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1436380334198475, "epoch": 0.0011, "grad_norm": 0.11696452647447586, "kl": 0.49347666651010513, "learning_rate": 7.999997972567424e-06, "loss": -0.0893, "step": 110, "step_time": 11.303073190007126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.71875, "completions/mean_terminated_length": 5.392857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1928130388259888, "epoch": 0.00111, "frac_reward_zero_std": 0.0, "grad_norm": 0.20942629873752594, "kl": 0.3826353084295988, "learning_rate": 7.999997917401717e-06, "loss": -0.0949, "num_tokens": 2823041.0, "reward": 0.5845121145248413, "reward_std": 1.1711515188217163, "rewards/rollout_reward_func/mean": 0.5845121145248413, "rewards/rollout_reward_func/std": 1.1711513996124268, "sampling/importance_sampling_ratio/max": 1.8773937225341797, "sampling/importance_sampling_ratio/mean": 0.8766149282455444, "sampling/importance_sampling_ratio/min": 1.0210060708004676e-07, "sampling/sampling_logp_difference/max": 1.8776451349258423, "sampling/sampling_logp_difference/mean": 0.28544101119041443, "step": 111, "step_time": 20.481671999004902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.192817710340023, "epoch": 0.00112, "grad_norm": 0.22485382854938507, "kl": 0.38237693905830383, "learning_rate": 7.99999786149553e-06, "loss": -0.0957, "step": 112, "step_time": 12.321275013993727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.9375, "completions/mean_terminated_length": 4.84615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1486697122454643, "epoch": 0.00113, "frac_reward_zero_std": 0.0, "grad_norm": 0.08724779635667801, "kl": 0.5613560695201159, "learning_rate": 7.999997804848863e-06, "loss": -0.0728, "num_tokens": 2866509.0, "reward": 0.7315294146537781, "reward_std": 1.2914371490478516, "rewards/rollout_reward_func/mean": 0.7315294146537781, "rewards/rollout_reward_func/std": 1.2914371490478516, "sampling/importance_sampling_ratio/max": 1.858439564704895, "sampling/importance_sampling_ratio/mean": 0.9200788736343384, "sampling/importance_sampling_ratio/min": 0.00013421518087852746, "sampling/sampling_logp_difference/max": 2.088101625442505, "sampling/sampling_logp_difference/mean": 0.26403364539146423, "step": 113, "step_time": 18.501389472963638 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 1.1479125656187534, "epoch": 0.00114, "grad_norm": 0.09234936535358429, "kl": 0.5605039298534393, "learning_rate": 7.999997747461717e-06, "loss": -0.0728, "step": 114, "step_time": 9.951440214019385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.625, "completions/mean_terminated_length": 6.428571701049805, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4666100069880486, "epoch": 0.00115, "frac_reward_zero_std": 0.0, "grad_norm": 0.24085015058517456, "kl": 0.6343043092638254, "learning_rate": 7.999997689334094e-06, "loss": -0.062, "num_tokens": 2924084.0, "reward": 0.30507224798202515, "reward_std": 0.996490478515625, "rewards/rollout_reward_func/mean": 0.30507224798202515, "rewards/rollout_reward_func/std": 0.996490478515625, "sampling/importance_sampling_ratio/max": 1.7246466875076294, "sampling/importance_sampling_ratio/mean": 0.6592774391174316, "sampling/importance_sampling_ratio/min": 6.99673400959e-05, "sampling/sampling_logp_difference/max": 1.9244287014007568, "sampling/sampling_logp_difference/mean": 0.32180696725845337, "step": 115, "step_time": 21.271084987020004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003289473708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003289473708719015, "entropy": 1.4625906571745872, "epoch": 0.00116, "grad_norm": 0.22110579907894135, "kl": 0.6299438774585724, "learning_rate": 7.99999763046599e-06, "loss": -0.0623, "step": 116, "step_time": 11.240293783004745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 6.076923370361328, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7396336160600185, "epoch": 0.00117, "frac_reward_zero_std": 0.0, "grad_norm": 0.13286778330802917, "kl": 0.424885050393641, "learning_rate": 7.999997570857409e-06, "loss": -0.0988, "num_tokens": 2976192.0, "reward": 0.6237162351608276, "reward_std": 1.1289615631103516, "rewards/rollout_reward_func/mean": 0.6237162351608276, "rewards/rollout_reward_func/std": 1.128961443901062, "sampling/importance_sampling_ratio/max": 1.894734501838684, "sampling/importance_sampling_ratio/mean": 0.7098982334136963, "sampling/importance_sampling_ratio/min": 0.0008977942052297294, "sampling/sampling_logp_difference/max": 2.0621492862701416, "sampling/sampling_logp_difference/mean": 0.2995651960372925, "step": 117, "step_time": 22.515888170019025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7330316044390202, "epoch": 0.00118, "grad_norm": 0.13176098465919495, "kl": 0.4063916215673089, "learning_rate": 7.999997510508348e-06, "loss": -0.0986, "step": 118, "step_time": 11.67736696600332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.53125, "completions/mean_terminated_length": 5.962963104248047, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.692598707973957, "epoch": 0.00119, "frac_reward_zero_std": 0.0, "grad_norm": 0.19013004004955292, "kl": 0.42042660154402256, "learning_rate": 7.999997449418809e-06, "loss": -0.0863, "num_tokens": 3027505.0, "reward": 0.5287418365478516, "reward_std": 1.1108042001724243, "rewards/rollout_reward_func/mean": 0.5287418365478516, "rewards/rollout_reward_func/std": 1.1108042001724243, "sampling/importance_sampling_ratio/max": 2.658508062362671, "sampling/importance_sampling_ratio/mean": 0.9330251216888428, "sampling/importance_sampling_ratio/min": 5.047020295023685e-06, "sampling/sampling_logp_difference/max": 2.039572238922119, "sampling/sampling_logp_difference/mean": 0.37275826930999756, "step": 119, "step_time": 21.052463858984993 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.004557291977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004557291977107525, "entropy": 1.6799028478562832, "epoch": 0.0012, "grad_norm": 0.17319054901599884, "kl": 0.4133539944887161, "learning_rate": 7.99999738758879e-06, "loss": -0.0872, "step": 120, "step_time": 11.191795385995647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 5.777777671813965, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.575472667813301, "epoch": 0.00121, "frac_reward_zero_std": 0.0, "grad_norm": 0.1861477792263031, "kl": 0.4234247598797083, "learning_rate": 7.999997325018293e-06, "loss": -0.1119, "num_tokens": 3072525.0, "reward": -0.03200504183769226, "reward_std": 1.1459941864013672, "rewards/rollout_reward_func/mean": -0.03200504183769226, "rewards/rollout_reward_func/std": 1.1459941864013672, "sampling/importance_sampling_ratio/max": 1.5631176233291626, "sampling/importance_sampling_ratio/mean": 0.7786738872528076, "sampling/importance_sampling_ratio/min": 2.656110154930502e-06, "sampling/sampling_logp_difference/max": 1.7508692741394043, "sampling/sampling_logp_difference/mean": 0.3421745300292969, "step": 121, "step_time": 20.63011691700376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.567276008427143, "epoch": 0.00122, "grad_norm": 0.18274982273578644, "kl": 0.4273998439311981, "learning_rate": 7.999997261707317e-06, "loss": -0.1125, "step": 122, "step_time": 11.695842054003151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.15625, "completions/mean_terminated_length": 6.703703880310059, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8562293946743011, "epoch": 0.00123, "frac_reward_zero_std": 0.0, "grad_norm": 0.41229355335235596, "kl": 0.42591427080333233, "learning_rate": 7.999997197655861e-06, "loss": -0.0694, "num_tokens": 3130011.0, "reward": 0.16418711841106415, "reward_std": 1.0094380378723145, "rewards/rollout_reward_func/mean": 0.16418711841106415, "rewards/rollout_reward_func/std": 1.0094380378723145, "sampling/importance_sampling_ratio/max": 2.0013391971588135, "sampling/importance_sampling_ratio/mean": 0.6805577278137207, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.280221700668335, "sampling/sampling_logp_difference/mean": 0.3862370252609253, "step": 123, "step_time": 22.114514096989296 }, { "clip_ratio/high_max": 0.022664836142212152, "clip_ratio/high_mean": 0.011332418071106076, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018276862567290664, "entropy": 1.85700723528862, "epoch": 0.00124, "grad_norm": 0.30091020464897156, "kl": 0.42807453870773315, "learning_rate": 7.999997132863928e-06, "loss": -0.0714, "step": 124, "step_time": 11.661921691993484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.375, "completions/mean_terminated_length": 5.3913044929504395, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6334304995834827, "epoch": 0.00125, "frac_reward_zero_std": 0.0, "grad_norm": 0.08052662014961243, "kl": 0.3621765002608299, "learning_rate": 7.999997067331516e-06, "loss": -0.0979, "num_tokens": 3174820.0, "reward": 0.487813800573349, "reward_std": 1.2952780723571777, "rewards/rollout_reward_func/mean": 0.487813800573349, "rewards/rollout_reward_func/std": 1.2952779531478882, "sampling/importance_sampling_ratio/max": 1.7639849185943604, "sampling/importance_sampling_ratio/mean": 0.7885684967041016, "sampling/importance_sampling_ratio/min": 1.8128528012084644e-08, "sampling/sampling_logp_difference/max": 2.847177505493164, "sampling/sampling_logp_difference/mean": 0.309546560049057, "step": 125, "step_time": 19.867469394986983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.636863574385643, "epoch": 0.00126, "grad_norm": 0.08659468591213226, "kl": 0.3638190384954214, "learning_rate": 7.999997001058625e-06, "loss": -0.0977, "step": 126, "step_time": 10.06527934600308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.625, "completions/mean_terminated_length": 5.692307949066162, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7896546721458435, "epoch": 0.00127, "frac_reward_zero_std": 0.0, "grad_norm": 0.23186390101909637, "kl": 0.4178910329937935, "learning_rate": 7.999996934045254e-06, "loss": -0.0619, "num_tokens": 3220693.0, "reward": 0.820522129535675, "reward_std": 1.1833999156951904, "rewards/rollout_reward_func/mean": 0.820522129535675, "rewards/rollout_reward_func/std": 1.1833999156951904, "sampling/importance_sampling_ratio/max": 1.583288311958313, "sampling/importance_sampling_ratio/mean": 0.8282482624053955, "sampling/importance_sampling_ratio/min": 1.1873769835801795e-05, "sampling/sampling_logp_difference/max": 2.125516414642334, "sampling/sampling_logp_difference/mean": 0.35265952348709106, "step": 127, "step_time": 22.12493246899976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7910342365503311, "epoch": 0.00128, "grad_norm": 0.21028649806976318, "kl": 0.41440989077091217, "learning_rate": 7.999996866291406e-06, "loss": -0.0629, "step": 128, "step_time": 11.366120402992237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.3125, "completions/mean_terminated_length": 4.4166669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5954458117485046, "epoch": 0.00129, "frac_reward_zero_std": 0.0, "grad_norm": 0.21080167591571808, "kl": 0.9126789011061192, "learning_rate": 7.999996797797079e-06, "loss": -0.0803, "num_tokens": 3273931.0, "reward": 0.2949213981628418, "reward_std": 1.0880231857299805, "rewards/rollout_reward_func/mean": 0.2949213981628418, "rewards/rollout_reward_func/std": 1.0880231857299805, "sampling/importance_sampling_ratio/max": 2.113299608230591, "sampling/importance_sampling_ratio/mean": 0.8794937133789062, "sampling/importance_sampling_ratio/min": 3.735787146297298e-08, "sampling/sampling_logp_difference/max": 2.290848970413208, "sampling/sampling_logp_difference/mean": 0.3740708529949188, "step": 129, "step_time": 21.2759643310128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.598283864557743, "epoch": 0.0013, "grad_norm": 0.2239302098751068, "kl": 0.884635541588068, "learning_rate": 7.999996728562273e-06, "loss": -0.0806, "step": 130, "step_time": 11.21450956899207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.6875, "completions/mean_terminated_length": 5.82608699798584, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0085705369710922, "epoch": 0.00131, "frac_reward_zero_std": 0.0, "grad_norm": 0.2508632242679596, "kl": 0.9535766486078501, "learning_rate": 7.999996658586989e-06, "loss": -0.123, "num_tokens": 3323319.0, "reward": 0.521964967250824, "reward_std": 1.3060171604156494, "rewards/rollout_reward_func/mean": 0.521964967250824, "rewards/rollout_reward_func/std": 1.3060171604156494, "sampling/importance_sampling_ratio/max": 1.423848032951355, "sampling/importance_sampling_ratio/mean": 0.5842812061309814, "sampling/importance_sampling_ratio/min": 1.2092079487047158e-05, "sampling/sampling_logp_difference/max": 3.2051360607147217, "sampling/sampling_logp_difference/mean": 0.35123974084854126, "step": 131, "step_time": 21.934544182004174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0005175322294235, "epoch": 0.00132, "grad_norm": 0.2614805996417999, "kl": 0.9292410407215357, "learning_rate": 7.999996587871225e-06, "loss": -0.1247, "step": 132, "step_time": 11.134703274990898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.34375, "completions/mean_terminated_length": 6.199999809265137, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7094082087278366, "epoch": 0.00133, "frac_reward_zero_std": 0.0, "grad_norm": 0.1579096019268036, "kl": 0.32677530497312546, "learning_rate": 7.999996516414982e-06, "loss": -0.0751, "num_tokens": 3380290.0, "reward": 0.4767501950263977, "reward_std": 0.9661151170730591, "rewards/rollout_reward_func/mean": 0.4767501950263977, "rewards/rollout_reward_func/std": 0.9661151766777039, "sampling/importance_sampling_ratio/max": 1.8360772132873535, "sampling/importance_sampling_ratio/mean": 0.7746154069900513, "sampling/importance_sampling_ratio/min": 8.4931671153754e-06, "sampling/sampling_logp_difference/max": 2.404482364654541, "sampling/sampling_logp_difference/mean": 0.34264934062957764, "step": 133, "step_time": 21.75995328000863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7092161923646927, "epoch": 0.00134, "grad_norm": 0.15039220452308655, "kl": 0.34166786074638367, "learning_rate": 7.999996444218262e-06, "loss": -0.0757, "step": 134, "step_time": 11.651179352993495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.03125, "completions/mean_terminated_length": 5.863636493682861, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7919823080301285, "epoch": 0.00135, "frac_reward_zero_std": 0.0, "grad_norm": 0.07801967859268188, "kl": 0.2615721272304654, "learning_rate": 7.999996371281063e-06, "loss": -0.0956, "num_tokens": 3427723.0, "reward": 0.31112244725227356, "reward_std": 1.2876455783843994, "rewards/rollout_reward_func/mean": 0.31112244725227356, "rewards/rollout_reward_func/std": 1.2876455783843994, "sampling/importance_sampling_ratio/max": 1.6324596405029297, "sampling/importance_sampling_ratio/mean": 0.7616143226623535, "sampling/importance_sampling_ratio/min": 4.9060645324061625e-06, "sampling/sampling_logp_difference/max": 2.2283589839935303, "sampling/sampling_logp_difference/mean": 0.3171490430831909, "step": 135, "step_time": 20.965451587020652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7964462265372276, "epoch": 0.00136, "grad_norm": 0.07216472923755646, "kl": 0.26630106288939714, "learning_rate": 7.999996297603385e-06, "loss": -0.0959, "step": 136, "step_time": 11.58120112598408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.9375, "completions/mean_terminated_length": 5.259259223937988, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2809952925890684, "epoch": 0.00137, "frac_reward_zero_std": 0.0, "grad_norm": 0.10205793380737305, "kl": 0.560392078012228, "learning_rate": 7.999996223185228e-06, "loss": -0.1015, "num_tokens": 3479534.0, "reward": 1.0211572647094727, "reward_std": 1.1305036544799805, "rewards/rollout_reward_func/mean": 1.0211572647094727, "rewards/rollout_reward_func/std": 1.1305036544799805, "sampling/importance_sampling_ratio/max": 1.456481695175171, "sampling/importance_sampling_ratio/mean": 0.8241063356399536, "sampling/importance_sampling_ratio/min": 6.979147292440757e-05, "sampling/sampling_logp_difference/max": 1.9607086181640625, "sampling/sampling_logp_difference/mean": 0.30562272667884827, "step": 137, "step_time": 21.486620645984658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2725389637053013, "epoch": 0.00138, "grad_norm": 0.10026180744171143, "kl": 0.5938426852226257, "learning_rate": 7.999996148026594e-06, "loss": -0.1017, "step": 138, "step_time": 11.213373298989609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.15625, "completions/mean_terminated_length": 5.518518447875977, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3387949839234352, "epoch": 0.00139, "frac_reward_zero_std": 0.0, "grad_norm": 0.12909936904907227, "kl": 1.131907805800438, "learning_rate": 7.99999607212748e-06, "loss": -0.0539, "num_tokens": 3534657.0, "reward": 0.6488708257675171, "reward_std": 1.101752758026123, "rewards/rollout_reward_func/mean": 0.6488708257675171, "rewards/rollout_reward_func/std": 1.101752758026123, "sampling/importance_sampling_ratio/max": 1.95093834400177, "sampling/importance_sampling_ratio/mean": 0.8330919742584229, "sampling/importance_sampling_ratio/min": 5.238966878096107e-07, "sampling/sampling_logp_difference/max": 2.583253860473633, "sampling/sampling_logp_difference/mean": 0.3540854752063751, "step": 139, "step_time": 20.96289307299594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.330450400710106, "epoch": 0.0014, "grad_norm": 0.13054580986499786, "kl": 1.1747618168592453, "learning_rate": 7.999995995487888e-06, "loss": -0.0541, "step": 140, "step_time": 11.117017894983292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.46875, "completions/mean_terminated_length": 5.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2984952479600906, "epoch": 0.00141, "frac_reward_zero_std": 0.0, "grad_norm": 0.1414995640516281, "kl": 0.6836626082658768, "learning_rate": 7.999995918107818e-06, "loss": -0.0377, "num_tokens": 3580758.0, "reward": 0.6567355394363403, "reward_std": 1.3689754009246826, "rewards/rollout_reward_func/mean": 0.6567355394363403, "rewards/rollout_reward_func/std": 1.3689754009246826, "sampling/importance_sampling_ratio/max": 2.516300916671753, "sampling/importance_sampling_ratio/mean": 0.840164065361023, "sampling/importance_sampling_ratio/min": 2.7703712476068176e-05, "sampling/sampling_logp_difference/max": 2.446661949157715, "sampling/sampling_logp_difference/mean": 0.2989017963409424, "step": 141, "step_time": 19.791682238021167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.2985043227672577, "epoch": 0.00142, "grad_norm": 0.13360373675823212, "kl": 0.7065780460834503, "learning_rate": 7.999995839987269e-06, "loss": -0.0379, "step": 142, "step_time": 10.724026287993183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.65625, "completions/mean_terminated_length": 5.730769634246826, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.541421353816986, "epoch": 0.00143, "frac_reward_zero_std": 0.0, "grad_norm": 0.15680661797523499, "kl": 0.6196000501513481, "learning_rate": 7.999995761126243e-06, "loss": -0.0607, "num_tokens": 3630032.0, "reward": 0.9466111660003662, "reward_std": 1.030047059059143, "rewards/rollout_reward_func/mean": 0.9466111660003662, "rewards/rollout_reward_func/std": 1.030047059059143, "sampling/importance_sampling_ratio/max": 2.772519111633301, "sampling/importance_sampling_ratio/mean": 0.7602358460426331, "sampling/importance_sampling_ratio/min": 0.00010380033927503973, "sampling/sampling_logp_difference/max": 1.8305926322937012, "sampling/sampling_logp_difference/mean": 0.3315626382827759, "step": 143, "step_time": 21.056698056010646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.548307865858078, "epoch": 0.00144, "grad_norm": 0.14576569199562073, "kl": 0.621769867837429, "learning_rate": 7.999995681524736e-06, "loss": -0.0614, "step": 144, "step_time": 11.476674632984214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 5.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7554443329572678, "epoch": 0.00145, "frac_reward_zero_std": 0.0, "grad_norm": 0.2298838198184967, "kl": 0.4550134986639023, "learning_rate": 7.999995601182752e-06, "loss": -0.0534, "num_tokens": 3687805.0, "reward": 0.5898558497428894, "reward_std": 0.8533220291137695, "rewards/rollout_reward_func/mean": 0.5898558497428894, "rewards/rollout_reward_func/std": 0.8533220291137695, "sampling/importance_sampling_ratio/max": 2.2411112785339355, "sampling/importance_sampling_ratio/mean": 0.8067657947540283, "sampling/importance_sampling_ratio/min": 2.2581909320251725e-07, "sampling/sampling_logp_difference/max": 1.8404407501220703, "sampling/sampling_logp_difference/mean": 0.3644079864025116, "step": 145, "step_time": 22.1402535019879 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 1.7623437941074371, "epoch": 0.00146, "grad_norm": 0.22643956542015076, "kl": 0.4557917956262827, "learning_rate": 7.999995520100289e-06, "loss": -0.0543, "step": 146, "step_time": 12.174131527004647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.34375, "completions/mean_terminated_length": 4.964285850524902, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2221902944147587, "epoch": 0.00147, "frac_reward_zero_std": 0.0, "grad_norm": 0.0806538537144661, "kl": 0.5607532411813736, "learning_rate": 7.999995438277348e-06, "loss": -0.0824, "num_tokens": 3739202.0, "reward": 0.7657586336135864, "reward_std": 1.2090176343917847, "rewards/rollout_reward_func/mean": 0.7657586336135864, "rewards/rollout_reward_func/std": 1.2090176343917847, "sampling/importance_sampling_ratio/max": 1.8658699989318848, "sampling/importance_sampling_ratio/mean": 0.8183500170707703, "sampling/importance_sampling_ratio/min": 5.288336978992447e-05, "sampling/sampling_logp_difference/max": 1.704147219657898, "sampling/sampling_logp_difference/mean": 0.3000243902206421, "step": 147, "step_time": 21.3403275070159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2251424938440323, "epoch": 0.00148, "grad_norm": 0.07712127268314362, "kl": 0.5581982098519802, "learning_rate": 7.99999535571393e-06, "loss": -0.0825, "step": 148, "step_time": 10.942238521980471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 5.185185432434082, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2337633278220892, "epoch": 0.00149, "frac_reward_zero_std": 0.0, "grad_norm": 0.17388609051704407, "kl": 0.41050388291478157, "learning_rate": 7.999995272410032e-06, "loss": -0.0818, "num_tokens": 3795428.0, "reward": 1.0596129894256592, "reward_std": 0.9916526079177856, "rewards/rollout_reward_func/mean": 1.0596129894256592, "rewards/rollout_reward_func/std": 0.9916524887084961, "sampling/importance_sampling_ratio/max": 1.7136718034744263, "sampling/importance_sampling_ratio/mean": 0.910074770450592, "sampling/importance_sampling_ratio/min": 8.591478084518656e-10, "sampling/sampling_logp_difference/max": 2.380753517150879, "sampling/sampling_logp_difference/mean": 0.3599182963371277, "step": 149, "step_time": 20.448312833017553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.234990962781012, "epoch": 0.0015, "grad_norm": 0.17290690541267395, "kl": 0.4220624640583992, "learning_rate": 7.999995188365656e-06, "loss": -0.0817, "step": 150, "step_time": 10.619404577984824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 4.714285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.276489282026887, "epoch": 0.00151, "frac_reward_zero_std": 0.0, "grad_norm": 0.07217800617218018, "kl": 1.1948318351060152, "learning_rate": 7.999995103580802e-06, "loss": -0.1193, "num_tokens": 3835426.0, "reward": 1.563314437866211, "reward_std": 0.69316166639328, "rewards/rollout_reward_func/mean": 1.563314437866211, "rewards/rollout_reward_func/std": 0.6931616067886353, "sampling/importance_sampling_ratio/max": 2.3216681480407715, "sampling/importance_sampling_ratio/mean": 0.9455357789993286, "sampling/importance_sampling_ratio/min": 6.8838852484987e-07, "sampling/sampling_logp_difference/max": 3.2646658420562744, "sampling/sampling_logp_difference/mean": 0.3738369643688202, "step": 151, "step_time": 17.526803818007465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2794624827802181, "epoch": 0.00152, "grad_norm": 0.07193594425916672, "kl": 1.232732754200697, "learning_rate": 7.99999501805547e-06, "loss": -0.1193, "step": 152, "step_time": 9.61810389101447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.71875, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9785734470933676, "epoch": 0.00153, "frac_reward_zero_std": 0.0, "grad_norm": 0.10791199654340744, "kl": 0.43247023969888687, "learning_rate": 7.99999493178966e-06, "loss": -0.0758, "num_tokens": 3876288.0, "reward": 1.0473476648330688, "reward_std": 1.2428189516067505, "rewards/rollout_reward_func/mean": 1.0473476648330688, "rewards/rollout_reward_func/std": 1.2428189516067505, "sampling/importance_sampling_ratio/max": 1.8123661279678345, "sampling/importance_sampling_ratio/mean": 0.9307516813278198, "sampling/importance_sampling_ratio/min": 4.367517976788804e-05, "sampling/sampling_logp_difference/max": 1.8778085708618164, "sampling/sampling_logp_difference/mean": 0.2455531656742096, "step": 153, "step_time": 19.173180225028773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9773592948913574, "epoch": 0.00154, "grad_norm": 0.10895771533250809, "kl": 0.44276880845427513, "learning_rate": 7.99999484478337e-06, "loss": -0.076, "step": 154, "step_time": 10.450833533963305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.84375, "completions/mean_terminated_length": 5.559999942779541, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7625192292034626, "epoch": 0.00155, "frac_reward_zero_std": 0.0, "grad_norm": 0.07770274579524994, "kl": 0.3685985580086708, "learning_rate": 7.999994757036603e-06, "loss": -0.0883, "num_tokens": 3925501.0, "reward": 0.24695876240730286, "reward_std": 1.3677712678909302, "rewards/rollout_reward_func/mean": 0.24695876240730286, "rewards/rollout_reward_func/std": 1.3677712678909302, "sampling/importance_sampling_ratio/max": 1.8455129861831665, "sampling/importance_sampling_ratio/mean": 0.7289930582046509, "sampling/importance_sampling_ratio/min": 2.339752199986833e-06, "sampling/sampling_logp_difference/max": 3.157489776611328, "sampling/sampling_logp_difference/mean": 0.3824422359466553, "step": 155, "step_time": 21.451360543011106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7621786370873451, "epoch": 0.00156, "grad_norm": 0.07354626059532166, "kl": 0.3636279571801424, "learning_rate": 7.999994668549356e-06, "loss": -0.0886, "step": 156, "step_time": 11.681483882974135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.46875, "completions/mean_terminated_length": 5.045454502105713, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1146973073482513, "epoch": 0.00157, "frac_reward_zero_std": 0.0, "grad_norm": 0.07180976867675781, "kl": 0.31312131881713867, "learning_rate": 7.999994579321633e-06, "loss": -0.0886, "num_tokens": 3977735.0, "reward": 0.21962633728981018, "reward_std": 1.124146580696106, "rewards/rollout_reward_func/mean": 0.21962633728981018, "rewards/rollout_reward_func/std": 1.1241464614868164, "sampling/importance_sampling_ratio/max": 1.5805552005767822, "sampling/importance_sampling_ratio/mean": 0.5679770708084106, "sampling/importance_sampling_ratio/min": 9.941945791069884e-07, "sampling/sampling_logp_difference/max": 2.5954976081848145, "sampling/sampling_logp_difference/mean": 0.4092082977294922, "step": 157, "step_time": 21.599761916004354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1098042502999306, "epoch": 0.00158, "grad_norm": 0.06897258758544922, "kl": 0.3296934086829424, "learning_rate": 7.999994489353432e-06, "loss": -0.0887, "step": 158, "step_time": 11.14524479098327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.8125, "completions/mean_terminated_length": 5.519999980926514, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7016233652830124, "epoch": 0.00159, "frac_reward_zero_std": 0.0, "grad_norm": 0.1301664113998413, "kl": 0.9138250760734081, "learning_rate": 7.999994398644752e-06, "loss": -0.0562, "num_tokens": 4026521.0, "reward": 0.2103075087070465, "reward_std": 1.2027417421340942, "rewards/rollout_reward_func/mean": 0.2103075087070465, "rewards/rollout_reward_func/std": 1.2027418613433838, "sampling/importance_sampling_ratio/max": 1.5757702589035034, "sampling/importance_sampling_ratio/mean": 0.5794583559036255, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.730837345123291, "sampling/sampling_logp_difference/mean": 0.374149888753891, "step": 159, "step_time": 19.974272735984414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.70602148771286, "epoch": 0.0016, "grad_norm": 0.12616753578186035, "kl": 0.8705835975706577, "learning_rate": 7.999994307195594e-06, "loss": -0.0562, "step": 160, "step_time": 10.598780393003835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 4.642857551574707, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3262802734971046, "epoch": 0.00161, "frac_reward_zero_std": 0.0, "grad_norm": 0.20691506564617157, "kl": 1.4127982407808304, "learning_rate": 7.99999421500596e-06, "loss": -0.0544, "num_tokens": 4085075.0, "reward": 0.3002272844314575, "reward_std": 1.1122541427612305, "rewards/rollout_reward_func/mean": 0.3002272844314575, "rewards/rollout_reward_func/std": 1.112254023551941, "sampling/importance_sampling_ratio/max": 1.9417401552200317, "sampling/importance_sampling_ratio/mean": 0.7323096990585327, "sampling/importance_sampling_ratio/min": 2.61551103903912e-05, "sampling/sampling_logp_difference/max": 3.113964557647705, "sampling/sampling_logp_difference/mean": 0.3815222382545471, "step": 161, "step_time": 23.288370384005248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.330643080174923, "epoch": 0.00162, "grad_norm": 0.14323195815086365, "kl": 1.2819876661524177, "learning_rate": 7.999994122075845e-06, "loss": -0.055, "step": 162, "step_time": 11.519469935985398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 4.5714287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0164634753018618, "epoch": 0.00163, "frac_reward_zero_std": 0.0, "grad_norm": 0.10039446502923965, "kl": 0.4166795890778303, "learning_rate": 7.999994028405253e-06, "loss": -0.0874, "num_tokens": 4131805.0, "reward": 0.3626368045806885, "reward_std": 1.2388547658920288, "rewards/rollout_reward_func/mean": 0.3626368045806885, "rewards/rollout_reward_func/std": 1.2388547658920288, "sampling/importance_sampling_ratio/max": 1.5529922246932983, "sampling/importance_sampling_ratio/mean": 0.9061205983161926, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.048219919204712, "sampling/sampling_logp_difference/mean": 0.27181801199913025, "step": 163, "step_time": 19.89366294199135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0203178925439715, "epoch": 0.00164, "grad_norm": 0.0953485369682312, "kl": 0.4213708657771349, "learning_rate": 7.999993933994183e-06, "loss": -0.0875, "step": 164, "step_time": 10.959192903028452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.0625, "completions/mean_terminated_length": 4.559999942779541, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5890030805021524, "epoch": 0.00165, "frac_reward_zero_std": 0.0, "grad_norm": 0.12125381082296371, "kl": 0.46022286638617516, "learning_rate": 7.999993838842636e-06, "loss": -0.0706, "num_tokens": 4179448.0, "reward": 0.5324352979660034, "reward_std": 1.2901312112808228, "rewards/rollout_reward_func/mean": 0.5324352979660034, "rewards/rollout_reward_func/std": 1.2901310920715332, "sampling/importance_sampling_ratio/max": 1.6245338916778564, "sampling/importance_sampling_ratio/mean": 0.7563846707344055, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.5085909366607666, "sampling/sampling_logp_difference/mean": 0.3520182967185974, "step": 165, "step_time": 21.607127417970332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5929267648607492, "epoch": 0.00166, "grad_norm": 0.12013240903615952, "kl": 0.4577424619346857, "learning_rate": 7.99999374295061e-06, "loss": -0.0706, "step": 166, "step_time": 12.226068345014937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 4.592592716217041, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2036772277206182, "epoch": 0.00167, "frac_reward_zero_std": 0.0, "grad_norm": 0.10972525179386139, "kl": 0.651996735483408, "learning_rate": 7.999993646318106e-06, "loss": -0.0719, "num_tokens": 4236917.0, "reward": 0.7601081132888794, "reward_std": 1.173776626586914, "rewards/rollout_reward_func/mean": 0.7601081132888794, "rewards/rollout_reward_func/std": 1.173776626586914, "sampling/importance_sampling_ratio/max": 1.9727555513381958, "sampling/importance_sampling_ratio/mean": 0.936115026473999, "sampling/importance_sampling_ratio/min": 4.886679016635753e-05, "sampling/sampling_logp_difference/max": 1.8364224433898926, "sampling/sampling_logp_difference/mean": 0.31064707040786743, "step": 167, "step_time": 21.62405480099551 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 1.2078653015196323, "epoch": 0.00168, "grad_norm": 0.10546831041574478, "kl": 0.6334488391876221, "learning_rate": 7.999993548945123e-06, "loss": -0.0722, "step": 168, "step_time": 11.215924975986127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 5.384615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.296417262405157, "epoch": 0.00169, "frac_reward_zero_std": 0.0, "grad_norm": 0.12372858822345734, "kl": 0.7008023653179407, "learning_rate": 7.999993450831664e-06, "loss": -0.0467, "num_tokens": 4287708.0, "reward": 0.5694789886474609, "reward_std": 1.2126119136810303, "rewards/rollout_reward_func/mean": 0.5694789886474609, "rewards/rollout_reward_func/std": 1.2126119136810303, "sampling/importance_sampling_ratio/max": 1.9062552452087402, "sampling/importance_sampling_ratio/mean": 0.7944549322128296, "sampling/importance_sampling_ratio/min": 3.8037446756788995e-06, "sampling/sampling_logp_difference/max": 2.0204787254333496, "sampling/sampling_logp_difference/mean": 0.3087497353553772, "step": 169, "step_time": 21.639665668000816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2975483387708664, "epoch": 0.0017, "grad_norm": 0.1279054433107376, "kl": 0.6694894023239613, "learning_rate": 7.999993351977727e-06, "loss": -0.0467, "step": 170, "step_time": 11.560545120984898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.96875, "completions/mean_terminated_length": 5.2916669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5979301109910011, "epoch": 0.00171, "frac_reward_zero_std": 0.0, "grad_norm": 0.18571361899375916, "kl": 0.2951988950371742, "learning_rate": 7.999993252383311e-06, "loss": -0.0964, "num_tokens": 4341974.0, "reward": 0.6059678196907043, "reward_std": 1.2076282501220703, "rewards/rollout_reward_func/mean": 0.6059678196907043, "rewards/rollout_reward_func/std": 1.2076282501220703, "sampling/importance_sampling_ratio/max": 2.081467628479004, "sampling/importance_sampling_ratio/mean": 0.8608572483062744, "sampling/importance_sampling_ratio/min": 9.621501249057474e-07, "sampling/sampling_logp_difference/max": 2.8231711387634277, "sampling/sampling_logp_difference/mean": 0.3308892846107483, "step": 171, "step_time": 22.35453252303705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.597814179956913, "epoch": 0.00172, "grad_norm": 0.18874627351760864, "kl": 0.2849695347249508, "learning_rate": 7.999993152048418e-06, "loss": -0.0972, "step": 172, "step_time": 11.098467136995168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 5.185185432434082, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2360386531800032, "epoch": 0.00173, "frac_reward_zero_std": 0.25, "grad_norm": 0.14665324985980988, "kl": 0.41459524910897017, "learning_rate": 7.999993050973047e-06, "loss": -0.0779, "num_tokens": 4385110.0, "reward": 1.18587064743042, "reward_std": 1.210247278213501, "rewards/rollout_reward_func/mean": 1.18587064743042, "rewards/rollout_reward_func/std": 1.210247278213501, "sampling/importance_sampling_ratio/max": 1.7010729312896729, "sampling/importance_sampling_ratio/mean": 0.9227437376976013, "sampling/importance_sampling_ratio/min": 1.242291176595245e-07, "sampling/sampling_logp_difference/max": 1.846907377243042, "sampling/sampling_logp_difference/mean": 0.27090322971343994, "step": 173, "step_time": 19.718493736014352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.2368275225162506, "epoch": 0.00174, "grad_norm": 0.15358339250087738, "kl": 0.4282354023307562, "learning_rate": 7.999992949157197e-06, "loss": -0.0777, "step": 174, "step_time": 10.850172510996344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.03125, "completions/mean_terminated_length": 5.863636493682861, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4054027795791626, "epoch": 0.00175, "frac_reward_zero_std": 0.0, "grad_norm": 0.24111555516719818, "kl": 0.4764941278845072, "learning_rate": 7.999992846600872e-06, "loss": -0.061, "num_tokens": 4432800.0, "reward": 0.3970183730125427, "reward_std": 1.1446133852005005, "rewards/rollout_reward_func/mean": 0.3970183730125427, "rewards/rollout_reward_func/std": 1.14461350440979, "sampling/importance_sampling_ratio/max": 1.625251293182373, "sampling/importance_sampling_ratio/mean": 0.5627062916755676, "sampling/importance_sampling_ratio/min": 1.070409325620858e-07, "sampling/sampling_logp_difference/max": 2.71304988861084, "sampling/sampling_logp_difference/mean": 0.4490213096141815, "step": 175, "step_time": 22.45902702798776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 2.4050710387527943, "epoch": 0.00176, "grad_norm": 0.2502260208129883, "kl": 0.5289854176808149, "learning_rate": 7.999992743304069e-06, "loss": -0.0614, "step": 176, "step_time": 11.17968871096673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.46875, "completions/mean_terminated_length": 5.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.271098107099533, "epoch": 0.00177, "frac_reward_zero_std": 0.0, "grad_norm": 0.17016665637493134, "kl": 0.20891800802201033, "learning_rate": 7.999992639266786e-06, "loss": -0.0731, "num_tokens": 4478351.0, "reward": 0.36555179953575134, "reward_std": 1.2928783893585205, "rewards/rollout_reward_func/mean": 0.36555179953575134, "rewards/rollout_reward_func/std": 1.292878270149231, "sampling/importance_sampling_ratio/max": 1.3605849742889404, "sampling/importance_sampling_ratio/mean": 0.6326606273651123, "sampling/importance_sampling_ratio/min": 3.5956538795289816e-06, "sampling/sampling_logp_difference/max": 2.01981782913208, "sampling/sampling_logp_difference/mean": 0.3722599744796753, "step": 177, "step_time": 20.088329217003775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2711285650730133, "epoch": 0.00178, "grad_norm": 0.1646575629711151, "kl": 0.220208958722651, "learning_rate": 7.999992534489026e-06, "loss": -0.0734, "step": 178, "step_time": 10.288163563993294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.34375, "completions/mean_terminated_length": 4.863636493682861, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6274608075618744, "epoch": 0.00179, "frac_reward_zero_std": 0.0, "grad_norm": 0.21402722597122192, "kl": 0.8836196213960648, "learning_rate": 7.99999242897079e-06, "loss": -0.0789, "num_tokens": 4529864.0, "reward": 0.5083537697792053, "reward_std": 1.301440715789795, "rewards/rollout_reward_func/mean": 0.5083537697792053, "rewards/rollout_reward_func/std": 1.3014405965805054, "sampling/importance_sampling_ratio/max": 1.4606221914291382, "sampling/importance_sampling_ratio/mean": 0.6534599661827087, "sampling/importance_sampling_ratio/min": 2.734494410105981e-05, "sampling/sampling_logp_difference/max": 2.246497631072998, "sampling/sampling_logp_difference/mean": 0.2952132225036621, "step": 179, "step_time": 21.362192832981236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 1.6288563925772905, "epoch": 0.0018, "grad_norm": 0.16288624703884125, "kl": 0.838687508367002, "learning_rate": 7.999992322712075e-06, "loss": -0.08, "step": 180, "step_time": 11.639183878025506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.928571701049805, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3644575625658035, "epoch": 0.00181, "frac_reward_zero_std": 0.0, "grad_norm": 0.09831807762384415, "kl": 0.8359664939343929, "learning_rate": 7.999992215712882e-06, "loss": -0.1032, "num_tokens": 4575051.0, "reward": 1.0530773401260376, "reward_std": 1.1790317296981812, "rewards/rollout_reward_func/mean": 1.0530773401260376, "rewards/rollout_reward_func/std": 1.1790316104888916, "sampling/importance_sampling_ratio/max": 2.0898053646087646, "sampling/importance_sampling_ratio/mean": 0.8362575769424438, "sampling/importance_sampling_ratio/min": 0.00012495004921220243, "sampling/sampling_logp_difference/max": 1.8993983268737793, "sampling/sampling_logp_difference/mean": 0.3181629180908203, "step": 181, "step_time": 22.129774838031153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.370016187429428, "epoch": 0.00182, "grad_norm": 0.10233374685049057, "kl": 0.793476939201355, "learning_rate": 7.999992107973214e-06, "loss": -0.1031, "step": 182, "step_time": 11.508060280990321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.59375, "completions/mean_terminated_length": 4.7916669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.665930300951004, "epoch": 0.00183, "frac_reward_zero_std": 0.0, "grad_norm": 0.17853480577468872, "kl": 0.2981446851044893, "learning_rate": 7.999991999493065e-06, "loss": -0.071, "num_tokens": 4629586.0, "reward": 0.04798552393913269, "reward_std": 0.9084416031837463, "rewards/rollout_reward_func/mean": 0.04798552393913269, "rewards/rollout_reward_func/std": 0.9084416031837463, "sampling/importance_sampling_ratio/max": 1.5080032348632812, "sampling/importance_sampling_ratio/mean": 0.7914587259292603, "sampling/importance_sampling_ratio/min": 1.7173235391965136e-05, "sampling/sampling_logp_difference/max": 2.3812856674194336, "sampling/sampling_logp_difference/mean": 0.3300588130950928, "step": 183, "step_time": 21.699108545013587 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.666107501834631, "epoch": 0.00184, "grad_norm": 0.17845360934734344, "kl": 0.27919645700603724, "learning_rate": 7.999991890272441e-06, "loss": -0.0712, "step": 184, "step_time": 11.606294795987196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.78125, "completions/mean_terminated_length": 5.464285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2058645663782954, "epoch": 0.00185, "frac_reward_zero_std": 0.0, "grad_norm": 0.08534277975559235, "kl": 0.558193476870656, "learning_rate": 7.999991780311339e-06, "loss": -0.0758, "num_tokens": 4681108.0, "reward": 1.2702393531799316, "reward_std": 1.0490679740905762, "rewards/rollout_reward_func/mean": 1.2702393531799316, "rewards/rollout_reward_func/std": 1.0490679740905762, "sampling/importance_sampling_ratio/max": 1.4299242496490479, "sampling/importance_sampling_ratio/mean": 0.8936188817024231, "sampling/importance_sampling_ratio/min": 2.7146882075612666e-06, "sampling/sampling_logp_difference/max": 1.906693458557129, "sampling/sampling_logp_difference/mean": 0.31461894512176514, "step": 185, "step_time": 19.91892927599838 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "entropy": 1.2069701487198472, "epoch": 0.00186, "grad_norm": 0.05075041949748993, "kl": 0.5161408502608538, "learning_rate": 7.999991669609758e-06, "loss": -0.076, "step": 186, "step_time": 10.710335806987132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.71875, "completions/mean_terminated_length": 5.807692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9156082831323147, "epoch": 0.00187, "frac_reward_zero_std": 0.0, "grad_norm": 0.06372180581092834, "kl": 0.9690718594938517, "learning_rate": 7.999991558167702e-06, "loss": -0.1037, "num_tokens": 4729736.0, "reward": 0.7530470490455627, "reward_std": 1.3728065490722656, "rewards/rollout_reward_func/mean": 0.7530470490455627, "rewards/rollout_reward_func/std": 1.3728065490722656, "sampling/importance_sampling_ratio/max": 1.9222583770751953, "sampling/importance_sampling_ratio/mean": 0.6061673164367676, "sampling/importance_sampling_ratio/min": 7.369852141891897e-07, "sampling/sampling_logp_difference/max": 2.2485601902008057, "sampling/sampling_logp_difference/mean": 0.3940808176994324, "step": 187, "step_time": 20.350479193002684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9093354986980557, "epoch": 0.00188, "grad_norm": 0.06304702162742615, "kl": 0.941553695127368, "learning_rate": 7.999991445985168e-06, "loss": -0.1036, "step": 188, "step_time": 10.510422511026263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.1875, "completions/mean_terminated_length": 7.44444465637207, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4662718325853348, "epoch": 0.00189, "frac_reward_zero_std": 0.0, "grad_norm": 0.12859977781772614, "kl": 0.24574382859282196, "learning_rate": 7.999991333062156e-06, "loss": -0.0989, "num_tokens": 4783422.0, "reward": 0.01914658024907112, "reward_std": 1.1484752893447876, "rewards/rollout_reward_func/mean": 0.01914658024907112, "rewards/rollout_reward_func/std": 1.1484752893447876, "sampling/importance_sampling_ratio/max": 1.9478083848953247, "sampling/importance_sampling_ratio/mean": 0.4486072063446045, "sampling/importance_sampling_ratio/min": 9.261531772608578e-07, "sampling/sampling_logp_difference/max": 2.9413723945617676, "sampling/sampling_logp_difference/mean": 0.3834630250930786, "step": 189, "step_time": 21.62897611896915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4673528522253036, "epoch": 0.0019, "grad_norm": 0.12888100743293762, "kl": 0.2442969218827784, "learning_rate": 7.999991219398665e-06, "loss": -0.099, "step": 190, "step_time": 11.070569811025052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.28125, "completions/mean_terminated_length": 6.849999904632568, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9080400727689266, "epoch": 0.00191, "frac_reward_zero_std": 0.0, "grad_norm": 0.13169635832309723, "kl": 0.46794734988361597, "learning_rate": 7.999991104994699e-06, "loss": -0.0711, "num_tokens": 4838020.0, "reward": 0.13989301025867462, "reward_std": 1.1185712814331055, "rewards/rollout_reward_func/mean": 0.13989301025867462, "rewards/rollout_reward_func/std": 1.118571162223816, "sampling/importance_sampling_ratio/max": 2.1338083744049072, "sampling/importance_sampling_ratio/mean": 0.5666083097457886, "sampling/importance_sampling_ratio/min": 8.726250655399781e-08, "sampling/sampling_logp_difference/max": 2.5437722206115723, "sampling/sampling_logp_difference/mean": 0.336800754070282, "step": 191, "step_time": 22.41523110201524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9108029641211033, "epoch": 0.00192, "grad_norm": 0.12969467043876648, "kl": 0.4392039319500327, "learning_rate": 7.999990989850255e-06, "loss": -0.0716, "step": 192, "step_time": 10.669991740986006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.15625, "completions/mean_terminated_length": 5.5714287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7130211889743805, "epoch": 0.00193, "frac_reward_zero_std": 0.0, "grad_norm": 0.11029750108718872, "kl": 0.33105778601020575, "learning_rate": 7.999990873965335e-06, "loss": -0.1133, "num_tokens": 4895730.0, "reward": 0.3873368799686432, "reward_std": 1.3111690282821655, "rewards/rollout_reward_func/mean": 0.3873368799686432, "rewards/rollout_reward_func/std": 1.3111690282821655, "sampling/importance_sampling_ratio/max": 1.8325276374816895, "sampling/importance_sampling_ratio/mean": 0.6509191989898682, "sampling/importance_sampling_ratio/min": 0.0012903152965009212, "sampling/sampling_logp_difference/max": 2.2715752124786377, "sampling/sampling_logp_difference/mean": 0.2812280058860779, "step": 193, "step_time": 28.689538025995716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7159255892038345, "epoch": 0.00194, "grad_norm": 0.11068301647901535, "kl": 0.33950769854709506, "learning_rate": 7.999990757339936e-06, "loss": -0.1131, "step": 194, "step_time": 14.167606331015122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.3125, "completions/mean_terminated_length": 5.809524059295654, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8397020772099495, "epoch": 0.00195, "frac_reward_zero_std": 0.0, "grad_norm": 0.13507214188575745, "kl": 0.6099705509841442, "learning_rate": 7.99999063997406e-06, "loss": -0.098, "num_tokens": 4951498.0, "reward": 0.14388054609298706, "reward_std": 1.2666056156158447, "rewards/rollout_reward_func/mean": 0.14388054609298706, "rewards/rollout_reward_func/std": 1.2666056156158447, "sampling/importance_sampling_ratio/max": 1.6731853485107422, "sampling/importance_sampling_ratio/mean": 0.550056517124176, "sampling/importance_sampling_ratio/min": 0.00018688052659854293, "sampling/sampling_logp_difference/max": 2.5550010204315186, "sampling/sampling_logp_difference/mean": 0.30930131673812866, "step": 195, "step_time": 27.6903876580036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.839954748749733, "epoch": 0.00196, "grad_norm": 0.1249907985329628, "kl": 0.6060127820819616, "learning_rate": 7.999990521867708e-06, "loss": -0.0983, "step": 196, "step_time": 13.465109190990916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.59375, "completions/mean_terminated_length": 5.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9674859941005707, "epoch": 0.00197, "frac_reward_zero_std": 0.0, "grad_norm": 0.16492624580860138, "kl": 0.8138342024758458, "learning_rate": 7.999990403020877e-06, "loss": -0.0417, "num_tokens": 5009848.0, "reward": 0.14502963423728943, "reward_std": 1.1383486986160278, "rewards/rollout_reward_func/mean": 0.14502963423728943, "rewards/rollout_reward_func/std": 1.1383486986160278, "sampling/importance_sampling_ratio/max": 1.8559691905975342, "sampling/importance_sampling_ratio/mean": 0.5648907423019409, "sampling/importance_sampling_ratio/min": 8.34078207390121e-07, "sampling/sampling_logp_difference/max": 2.063784122467041, "sampling/sampling_logp_difference/mean": 0.37575647234916687, "step": 197, "step_time": 25.405276418983703 }, { "clip_ratio/high_max": 0.019323671702295542, "clip_ratio/high_mean": 0.014870169339701533, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014870169339701533, "entropy": 1.9684479013085365, "epoch": 0.00198, "grad_norm": 0.12929660081863403, "kl": 0.7374402433633804, "learning_rate": 7.99999028343357e-06, "loss": -0.0421, "step": 198, "step_time": 12.429772619027062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 5.454545497894287, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8442481867969036, "epoch": 0.00199, "frac_reward_zero_std": 0.0, "grad_norm": 0.15357711911201477, "kl": 1.3698142636567354, "learning_rate": 7.999990163105786e-06, "loss": -0.0755, "num_tokens": 5058993.0, "reward": 0.7026046514511108, "reward_std": 1.2940713167190552, "rewards/rollout_reward_func/mean": 0.7026046514511108, "rewards/rollout_reward_func/std": 1.2940711975097656, "sampling/importance_sampling_ratio/max": 2.0585389137268066, "sampling/importance_sampling_ratio/mean": 0.6612690687179565, "sampling/importance_sampling_ratio/min": 0.00027871871134266257, "sampling/sampling_logp_difference/max": 1.8184701204299927, "sampling/sampling_logp_difference/mean": 0.30622729659080505, "step": 199, "step_time": 26.269248675002018 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.8534026499837637, "epoch": 0.002, "grad_norm": 0.12622886896133423, "kl": 1.3658232372254133, "learning_rate": 7.999990042037526e-06, "loss": -0.076, "step": 200, "step_time": 12.599083649998647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.96875, "completions/mean_terminated_length": 5.2916669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6455742940306664, "epoch": 0.00201, "frac_reward_zero_std": 0.0, "grad_norm": 0.11873354017734528, "kl": 0.25516448356211185, "learning_rate": 7.999989920228787e-06, "loss": -0.1025, "num_tokens": 5104099.0, "reward": 1.1065576076507568, "reward_std": 1.2735867500305176, "rewards/rollout_reward_func/mean": 1.1065576076507568, "rewards/rollout_reward_func/std": 1.273586630821228, "sampling/importance_sampling_ratio/max": 1.890783667564392, "sampling/importance_sampling_ratio/mean": 0.7498313188552856, "sampling/importance_sampling_ratio/min": 5.1762553994194604e-06, "sampling/sampling_logp_difference/max": 2.0034890174865723, "sampling/sampling_logp_difference/mean": 0.3084884285926819, "step": 201, "step_time": 20.93930995799019 }, { "clip_ratio/high_max": 0.032620614394545555, "clip_ratio/high_mean": 0.016310307197272778, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016310307197272778, "entropy": 1.653382495045662, "epoch": 0.00202, "grad_norm": 0.04651179909706116, "kl": 0.27287874184548855, "learning_rate": 7.999989797679573e-06, "loss": -0.1027, "step": 202, "step_time": 10.461195481009781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.6875, "completions/mean_terminated_length": 6.38095235824585, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9994595348834991, "epoch": 0.00203, "frac_reward_zero_std": 0.0, "grad_norm": 0.15202419459819794, "kl": 0.27893034368753433, "learning_rate": 7.99998967438988e-06, "loss": -0.0729, "num_tokens": 5160838.0, "reward": 0.6613981127738953, "reward_std": 1.1912357807159424, "rewards/rollout_reward_func/mean": 0.6613981127738953, "rewards/rollout_reward_func/std": 1.1912357807159424, "sampling/importance_sampling_ratio/max": 1.3248381614685059, "sampling/importance_sampling_ratio/mean": 0.49232274293899536, "sampling/importance_sampling_ratio/min": 4.4001627230727536e-08, "sampling/sampling_logp_difference/max": 2.5078654289245605, "sampling/sampling_logp_difference/mean": 0.3786037862300873, "step": 203, "step_time": 26.554127166993567 }, { "clip_ratio/high_max": 0.033238636795431376, "clip_ratio/high_mean": 0.021427010418847203, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021427010418847203, "entropy": 2.009207561612129, "epoch": 0.00204, "grad_norm": 0.12353045493364334, "kl": 0.2725708717480302, "learning_rate": 7.999989550359713e-06, "loss": -0.0739, "step": 204, "step_time": 13.424865395005327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.78125, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7938257530331612, "epoch": 0.00205, "frac_reward_zero_std": 0.0, "grad_norm": 0.08711834996938705, "kl": 0.37914965907111764, "learning_rate": 7.999989425589066e-06, "loss": -0.1226, "num_tokens": 5212708.0, "reward": 0.6457921266555786, "reward_std": 1.2949131727218628, "rewards/rollout_reward_func/mean": 0.6457921266555786, "rewards/rollout_reward_func/std": 1.2949132919311523, "sampling/importance_sampling_ratio/max": 1.9661786556243896, "sampling/importance_sampling_ratio/mean": 0.6987441182136536, "sampling/importance_sampling_ratio/min": 9.487960596743505e-06, "sampling/sampling_logp_difference/max": 1.9042608737945557, "sampling/sampling_logp_difference/mean": 0.32265323400497437, "step": 205, "step_time": 25.50105870400148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8006295263767242, "epoch": 0.00206, "grad_norm": 0.08555116504430771, "kl": 0.36498567182570696, "learning_rate": 7.999989300077943e-06, "loss": -0.1226, "step": 206, "step_time": 13.097384873006376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.90625, "completions/mean_terminated_length": 5.222222328186035, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4339219462126493, "epoch": 0.00207, "frac_reward_zero_std": 0.0, "grad_norm": 0.10898369550704956, "kl": 0.4447516780346632, "learning_rate": 7.999989173826344e-06, "loss": -0.0797, "num_tokens": 5263090.0, "reward": 0.322836697101593, "reward_std": 1.2032971382141113, "rewards/rollout_reward_func/mean": 0.322836697101593, "rewards/rollout_reward_func/std": 1.2032970190048218, "sampling/importance_sampling_ratio/max": 1.596300721168518, "sampling/importance_sampling_ratio/mean": 0.8081009387969971, "sampling/importance_sampling_ratio/min": 1.5315734344767407e-05, "sampling/sampling_logp_difference/max": 1.9641528129577637, "sampling/sampling_logp_difference/mean": 0.30718573927879333, "step": 207, "step_time": 23.787231084963423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.434356577694416, "epoch": 0.00208, "grad_norm": 0.1140708178281784, "kl": 0.46255506575107574, "learning_rate": 7.999989046834267e-06, "loss": -0.0795, "step": 208, "step_time": 12.079161959001794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.1875, "completions/mean_terminated_length": 4.785714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.241585124284029, "epoch": 0.00209, "frac_reward_zero_std": 0.0, "grad_norm": 0.18340744078159332, "kl": 0.7430628454312682, "learning_rate": 7.999988919101714e-06, "loss": -0.0671, "num_tokens": 5307050.0, "reward": 0.8764724135398865, "reward_std": 1.278511881828308, "rewards/rollout_reward_func/mean": 0.8764724135398865, "rewards/rollout_reward_func/std": 1.2785117626190186, "sampling/importance_sampling_ratio/max": 1.4836621284484863, "sampling/importance_sampling_ratio/mean": 0.8401221632957458, "sampling/importance_sampling_ratio/min": 2.9616021492984146e-06, "sampling/sampling_logp_difference/max": 2.6617276668548584, "sampling/sampling_logp_difference/mean": 0.27872052788734436, "step": 209, "step_time": 19.833821656997316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2398783676326275, "epoch": 0.0021, "grad_norm": 0.1832769215106964, "kl": 0.730752938427031, "learning_rate": 7.999988790628685e-06, "loss": -0.0674, "step": 210, "step_time": 10.764782503989409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.59375, "completions/mean_terminated_length": 5.239999771118164, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5271094664931297, "epoch": 0.00211, "frac_reward_zero_std": 0.0, "grad_norm": 0.08936210721731186, "kl": 0.3335033915936947, "learning_rate": 7.999988661415179e-06, "loss": -0.0797, "num_tokens": 5359107.0, "reward": 0.8697794675827026, "reward_std": 1.2812761068344116, "rewards/rollout_reward_func/mean": 0.8697794675827026, "rewards/rollout_reward_func/std": 1.2812761068344116, "sampling/importance_sampling_ratio/max": 1.8925718069076538, "sampling/importance_sampling_ratio/mean": 0.7257797718048096, "sampling/importance_sampling_ratio/min": 0.00029899273067712784, "sampling/sampling_logp_difference/max": 2.1850433349609375, "sampling/sampling_logp_difference/mean": 0.26198166608810425, "step": 211, "step_time": 25.308442315013963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.536928329616785, "epoch": 0.00212, "grad_norm": 0.09325359761714935, "kl": 0.3446014914661646, "learning_rate": 7.999988531461196e-06, "loss": -0.08, "step": 212, "step_time": 12.688327308991575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 5.153846263885498, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7003738954663277, "epoch": 0.00213, "frac_reward_zero_std": 0.25, "grad_norm": 0.085854172706604, "kl": 0.3431752137839794, "learning_rate": 7.999988400766736e-06, "loss": -0.0514, "num_tokens": 5411556.0, "reward": 0.4366152584552765, "reward_std": 1.3325790166854858, "rewards/rollout_reward_func/mean": 0.4366152584552765, "rewards/rollout_reward_func/std": 1.3325788974761963, "sampling/importance_sampling_ratio/max": 2.4605116844177246, "sampling/importance_sampling_ratio/mean": 0.790947675704956, "sampling/importance_sampling_ratio/min": 6.012024300616758e-07, "sampling/sampling_logp_difference/max": 1.9983617067337036, "sampling/sampling_logp_difference/mean": 0.35615837574005127, "step": 213, "step_time": 25.06784961798985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.697919063270092, "epoch": 0.00214, "grad_norm": 0.08640997856855392, "kl": 0.3233846742659807, "learning_rate": 7.999988269331798e-06, "loss": -0.0514, "step": 214, "step_time": 13.531197736025206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.78125, "completions/mean_terminated_length": 4.724137783050537, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1452371813356876, "epoch": 0.00215, "frac_reward_zero_std": 0.0, "grad_norm": 0.135212704539299, "kl": 0.9285635277628899, "learning_rate": 7.999988137156384e-06, "loss": -0.1011, "num_tokens": 5458666.0, "reward": 0.542100191116333, "reward_std": 1.3382775783538818, "rewards/rollout_reward_func/mean": 0.542100191116333, "rewards/rollout_reward_func/std": 1.3382775783538818, "sampling/importance_sampling_ratio/max": 1.9670984745025635, "sampling/importance_sampling_ratio/mean": 0.7484497427940369, "sampling/importance_sampling_ratio/min": 8.469100087893366e-09, "sampling/sampling_logp_difference/max": 2.34106707572937, "sampling/sampling_logp_difference/mean": 0.31486567854881287, "step": 215, "step_time": 26.230090134980856 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 1.1404877342283726, "epoch": 0.00216, "grad_norm": 0.13892637193202972, "kl": 0.9345705024898052, "learning_rate": 7.999988004240496e-06, "loss": -0.1011, "step": 216, "step_time": 13.592465693989652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.1875, "completions/mean_terminated_length": 4.370370388031006, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.143936550244689, "epoch": 0.00217, "frac_reward_zero_std": 0.0, "grad_norm": 0.2528379559516907, "kl": 0.8294559940695763, "learning_rate": 7.99998787058413e-06, "loss": -0.0968, "num_tokens": 5501944.0, "reward": 0.7925770282745361, "reward_std": 1.3992630243301392, "rewards/rollout_reward_func/mean": 0.7925770282745361, "rewards/rollout_reward_func/std": 1.3992630243301392, "sampling/importance_sampling_ratio/max": 1.3051011562347412, "sampling/importance_sampling_ratio/mean": 0.7969761490821838, "sampling/importance_sampling_ratio/min": 0.0008550977800041437, "sampling/sampling_logp_difference/max": 1.655207633972168, "sampling/sampling_logp_difference/mean": 0.20095223188400269, "step": 217, "step_time": 19.81544334902719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1341043077409267, "epoch": 0.00218, "grad_norm": 0.25273585319519043, "kl": 0.9023497253656387, "learning_rate": 7.999987736187286e-06, "loss": -0.0975, "step": 218, "step_time": 10.32065845298348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.28125, "completions/mean_terminated_length": 4.839999675750732, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5058955401182175, "epoch": 0.00219, "frac_reward_zero_std": 0.0, "grad_norm": 0.08095620572566986, "kl": 0.480750082526356, "learning_rate": 7.999987601049967e-06, "loss": -0.0614, "num_tokens": 5564448.0, "reward": 0.4847296476364136, "reward_std": 1.1951870918273926, "rewards/rollout_reward_func/mean": 0.4847296476364136, "rewards/rollout_reward_func/std": 1.1951872110366821, "sampling/importance_sampling_ratio/max": 1.788336157798767, "sampling/importance_sampling_ratio/mean": 0.6999871730804443, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.1111512184143066, "sampling/sampling_logp_difference/mean": 0.3197779953479767, "step": 219, "step_time": 26.420712380015175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.504178497940302, "epoch": 0.0022, "grad_norm": 0.08203668892383575, "kl": 0.5084892301820219, "learning_rate": 7.99998746517217e-06, "loss": -0.0615, "step": 220, "step_time": 15.331492324970895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.03125, "completions/mean_terminated_length": 4.961538791656494, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2873619832098484, "epoch": 0.00221, "frac_reward_zero_std": 0.0, "grad_norm": 0.15117080509662628, "kl": 0.5615226496011019, "learning_rate": 7.999987328553901e-06, "loss": -0.0927, "num_tokens": 5606455.0, "reward": 0.8390895128250122, "reward_std": 1.2041540145874023, "rewards/rollout_reward_func/mean": 0.8390895128250122, "rewards/rollout_reward_func/std": 1.2041540145874023, "sampling/importance_sampling_ratio/max": 1.4446817636489868, "sampling/importance_sampling_ratio/mean": 0.7907516956329346, "sampling/importance_sampling_ratio/min": 6.693961040582508e-06, "sampling/sampling_logp_difference/max": 2.076903820037842, "sampling/sampling_logp_difference/mean": 0.26391270756721497, "step": 221, "step_time": 19.04231984895887 }, { "clip_ratio/high_max": 0.026988636702299118, "clip_ratio/high_mean": 0.013494318351149559, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019744318444281816, "entropy": 1.2777636051177979, "epoch": 0.00222, "grad_norm": 0.1106252446770668, "kl": 0.5990213938057423, "learning_rate": 7.999987191195152e-06, "loss": -0.0935, "step": 222, "step_time": 10.255336048008758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 4.666666507720947, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1776564940810204, "epoch": 0.00223, "frac_reward_zero_std": 0.0, "grad_norm": 0.05250415578484535, "kl": 0.9724632520228624, "learning_rate": 7.999987053095927e-06, "loss": -0.107, "num_tokens": 5664348.0, "reward": 1.1410046815872192, "reward_std": 1.1445441246032715, "rewards/rollout_reward_func/mean": 1.1410046815872192, "rewards/rollout_reward_func/std": 1.1445441246032715, "sampling/importance_sampling_ratio/max": 1.5567564964294434, "sampling/importance_sampling_ratio/mean": 0.7853524684906006, "sampling/importance_sampling_ratio/min": 1.5497851563850418e-05, "sampling/sampling_logp_difference/max": 1.921600103378296, "sampling/sampling_logp_difference/mean": 0.25880658626556396, "step": 223, "step_time": 23.844006118015386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.1799558140337467, "epoch": 0.00224, "grad_norm": 0.0536133348941803, "kl": 1.0029130317270756, "learning_rate": 7.999986914256228e-06, "loss": -0.107, "step": 224, "step_time": 12.739208789003897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.375, "completions/mean_terminated_length": 4.6666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8070549052208662, "epoch": 0.00225, "frac_reward_zero_std": 0.25, "grad_norm": 0.054397158324718475, "kl": 0.41051068529486656, "learning_rate": 7.99998677467605e-06, "loss": -0.0489, "num_tokens": 5711958.0, "reward": 1.053309679031372, "reward_std": 1.208139181137085, "rewards/rollout_reward_func/mean": 1.053309679031372, "rewards/rollout_reward_func/std": 1.208139181137085, "sampling/importance_sampling_ratio/max": 1.7134239673614502, "sampling/importance_sampling_ratio/mean": 0.959867000579834, "sampling/importance_sampling_ratio/min": 9.286544081987813e-05, "sampling/sampling_logp_difference/max": 1.7219687700271606, "sampling/sampling_logp_difference/mean": 0.17574340105056763, "step": 225, "step_time": 23.898604218033142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 0.8061733059585094, "epoch": 0.00226, "grad_norm": 0.05347398295998573, "kl": 0.40747668594121933, "learning_rate": 7.999986634355396e-06, "loss": -0.0489, "step": 226, "step_time": 12.424823854977149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.96875, "completions/mean_terminated_length": 4.931034564971924, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.027611155062914, "epoch": 0.00227, "frac_reward_zero_std": 0.0, "grad_norm": 0.1834397315979004, "kl": 1.2998314760625362, "learning_rate": 7.999986493294267e-06, "loss": -0.0388, "num_tokens": 5765448.0, "reward": 0.33631467819213867, "reward_std": 1.2494704723358154, "rewards/rollout_reward_func/mean": 0.33631467819213867, "rewards/rollout_reward_func/std": 1.2494704723358154, "sampling/importance_sampling_ratio/max": 2.0232107639312744, "sampling/importance_sampling_ratio/mean": 0.9330057501792908, "sampling/importance_sampling_ratio/min": 2.749527538981056e-07, "sampling/sampling_logp_difference/max": 2.222156047821045, "sampling/sampling_logp_difference/mean": 0.2701842188835144, "step": 227, "step_time": 26.125802335998742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0351419299840927, "epoch": 0.00228, "grad_norm": 0.18352589011192322, "kl": 1.2421129532158375, "learning_rate": 7.99998635149266e-06, "loss": -0.0394, "step": 228, "step_time": 13.96015784201154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.71875, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2869275361299515, "epoch": 0.00229, "frac_reward_zero_std": 0.0, "grad_norm": 0.17282815277576447, "kl": 0.5564666576683521, "learning_rate": 7.99998620895058e-06, "loss": -0.0998, "num_tokens": 5828707.0, "reward": 0.9261161088943481, "reward_std": 1.189758062362671, "rewards/rollout_reward_func/mean": 0.9261161088943481, "rewards/rollout_reward_func/std": 1.1897581815719604, "sampling/importance_sampling_ratio/max": 1.6795859336853027, "sampling/importance_sampling_ratio/mean": 0.7052087783813477, "sampling/importance_sampling_ratio/min": 0.00022960449859965593, "sampling/sampling_logp_difference/max": 1.7194771766662598, "sampling/sampling_logp_difference/mean": 0.2578885555267334, "step": 229, "step_time": 27.31092225498287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2955214083194733, "epoch": 0.0023, "grad_norm": 0.16385464370250702, "kl": 0.5572249293327332, "learning_rate": 7.999986065668021e-06, "loss": -0.1, "step": 230, "step_time": 14.558058347000042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.78125, "completions/mean_terminated_length": 5.0416669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6066813319921494, "epoch": 0.00231, "frac_reward_zero_std": 0.0, "grad_norm": 0.10718535631895065, "kl": 0.48757896944880486, "learning_rate": 7.999985921644988e-06, "loss": -0.0561, "num_tokens": 5885965.0, "reward": 0.44866812229156494, "reward_std": 1.256722092628479, "rewards/rollout_reward_func/mean": 0.44866812229156494, "rewards/rollout_reward_func/std": 1.256722092628479, "sampling/importance_sampling_ratio/max": 2.1013312339782715, "sampling/importance_sampling_ratio/mean": 0.7192672491073608, "sampling/importance_sampling_ratio/min": 0.00061929872026667, "sampling/sampling_logp_difference/max": 1.7934539318084717, "sampling/sampling_logp_difference/mean": 0.276638388633728, "step": 231, "step_time": 23.543332989996998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.6116859540343285, "epoch": 0.00232, "grad_norm": 0.06967446953058243, "kl": 0.4999822434037924, "learning_rate": 7.999985776881479e-06, "loss": -0.0566, "step": 232, "step_time": 12.047815563026234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.6875, "completions/mean_terminated_length": 4.9166669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8179932162165642, "epoch": 0.00233, "frac_reward_zero_std": 0.0, "grad_norm": 0.083480104804039, "kl": 0.628327370621264, "learning_rate": 7.999985631377493e-06, "loss": -0.0638, "num_tokens": 5943945.0, "reward": 0.5246484279632568, "reward_std": 1.200555682182312, "rewards/rollout_reward_func/mean": 0.5246484279632568, "rewards/rollout_reward_func/std": 1.200555682182312, "sampling/importance_sampling_ratio/max": 1.804004192352295, "sampling/importance_sampling_ratio/mean": 0.7639685869216919, "sampling/importance_sampling_ratio/min": 7.038695848393672e-09, "sampling/sampling_logp_difference/max": 2.240501642227173, "sampling/sampling_logp_difference/mean": 0.34337159991264343, "step": 233, "step_time": 26.3083953789901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.824227713048458, "epoch": 0.00234, "grad_norm": 0.08916965126991272, "kl": 0.6188110681250691, "learning_rate": 7.99998548513303e-06, "loss": -0.0642, "step": 234, "step_time": 13.978779295037384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 5.111111164093018, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.490475282073021, "epoch": 0.00235, "frac_reward_zero_std": 0.0, "grad_norm": 0.11095436662435532, "kl": 0.3333105929195881, "learning_rate": 7.999985338148094e-06, "loss": -0.0811, "num_tokens": 5991580.0, "reward": 0.800631046295166, "reward_std": 1.258049488067627, "rewards/rollout_reward_func/mean": 0.800631046295166, "rewards/rollout_reward_func/std": 1.2580493688583374, "sampling/importance_sampling_ratio/max": 1.9455991983413696, "sampling/importance_sampling_ratio/mean": 0.8360678553581238, "sampling/importance_sampling_ratio/min": 3.573251490252005e-07, "sampling/sampling_logp_difference/max": 2.4209470748901367, "sampling/sampling_logp_difference/mean": 0.28888002038002014, "step": 235, "step_time": 25.312861474987585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4996304661035538, "epoch": 0.00236, "grad_norm": 0.11599486321210861, "kl": 0.3306200848892331, "learning_rate": 7.99998519042268e-06, "loss": -0.0815, "step": 236, "step_time": 13.76759137501358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.875, "completions/mean_terminated_length": 4.695652484893799, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3935892432928085, "epoch": 0.00237, "frac_reward_zero_std": 0.0, "grad_norm": 0.0804717168211937, "kl": 0.4600952169857919, "learning_rate": 7.99998504195679e-06, "loss": -0.0991, "num_tokens": 6039049.0, "reward": 1.1529067754745483, "reward_std": 1.2538142204284668, "rewards/rollout_reward_func/mean": 1.1529067754745483, "rewards/rollout_reward_func/std": 1.2538142204284668, "sampling/importance_sampling_ratio/max": 2.6841607093811035, "sampling/importance_sampling_ratio/mean": 0.7724534869194031, "sampling/importance_sampling_ratio/min": 5.971245627733879e-06, "sampling/sampling_logp_difference/max": 1.7935266494750977, "sampling/sampling_logp_difference/mean": 0.26141875982284546, "step": 237, "step_time": 24.087761647009756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4004634469747543, "epoch": 0.00238, "grad_norm": 0.08749910444021225, "kl": 0.4421995310112834, "learning_rate": 7.999984892750425e-06, "loss": -0.0991, "step": 238, "step_time": 12.458777011997881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 5.119999885559082, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.648446723818779, "epoch": 0.00239, "frac_reward_zero_std": 0.0, "grad_norm": 0.21597081422805786, "kl": 0.30993655510246754, "learning_rate": 7.999984742803586e-06, "loss": -0.0866, "num_tokens": 6104594.0, "reward": 0.687299370765686, "reward_std": 1.2055617570877075, "rewards/rollout_reward_func/mean": 0.687299370765686, "rewards/rollout_reward_func/std": 1.2055617570877075, "sampling/importance_sampling_ratio/max": 1.5248632431030273, "sampling/importance_sampling_ratio/mean": 0.60545414686203, "sampling/importance_sampling_ratio/min": 0.00024063217279035598, "sampling/sampling_logp_difference/max": 1.6525217294692993, "sampling/sampling_logp_difference/mean": 0.2827250361442566, "step": 239, "step_time": 28.0888698730123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6504344791173935, "epoch": 0.0024, "grad_norm": 0.21755050122737885, "kl": 0.3214850425720215, "learning_rate": 7.999984592116268e-06, "loss": -0.0871, "step": 240, "step_time": 14.728121130989166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.40625, "completions/mean_terminated_length": 4.5416669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.477499034255743, "epoch": 0.00241, "frac_reward_zero_std": 0.0, "grad_norm": 0.09990420192480087, "kl": 0.31102152448147535, "learning_rate": 7.999984440688477e-06, "loss": -0.1115, "num_tokens": 6158871.0, "reward": 0.8692560195922852, "reward_std": 1.2837202548980713, "rewards/rollout_reward_func/mean": 0.8692560195922852, "rewards/rollout_reward_func/std": 1.2837202548980713, "sampling/importance_sampling_ratio/max": 2.235049247741699, "sampling/importance_sampling_ratio/mean": 0.7591169476509094, "sampling/importance_sampling_ratio/min": 5.2503337428788655e-06, "sampling/sampling_logp_difference/max": 2.437434673309326, "sampling/sampling_logp_difference/mean": 0.3101707994937897, "step": 241, "step_time": 27.21830835500441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4728505834937096, "epoch": 0.00242, "grad_norm": 0.10201866924762726, "kl": 0.3288099365308881, "learning_rate": 7.999984288520209e-06, "loss": -0.1117, "step": 242, "step_time": 13.700580949996947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 5.384615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3826408572494984, "epoch": 0.00243, "frac_reward_zero_std": 0.0, "grad_norm": 0.2786385118961334, "kl": 0.25304452888667583, "learning_rate": 7.999984135611465e-06, "loss": -0.0662, "num_tokens": 6218402.0, "reward": 0.2957342267036438, "reward_std": 1.1832104921340942, "rewards/rollout_reward_func/mean": 0.2957342267036438, "rewards/rollout_reward_func/std": 1.1832104921340942, "sampling/importance_sampling_ratio/max": 1.9254159927368164, "sampling/importance_sampling_ratio/mean": 0.7376142740249634, "sampling/importance_sampling_ratio/min": 0.0009867203189060092, "sampling/sampling_logp_difference/max": 1.537956714630127, "sampling/sampling_logp_difference/mean": 0.22620266675949097, "step": 243, "step_time": 26.540730061024078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3776583448052406, "epoch": 0.00244, "grad_norm": 0.27504754066467285, "kl": 0.24835946783423424, "learning_rate": 7.999983981962246e-06, "loss": -0.0688, "step": 244, "step_time": 13.986856439980329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.84375, "completions/mean_terminated_length": 5.559999942779541, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8628933355212212, "epoch": 0.00245, "frac_reward_zero_std": 0.0, "grad_norm": 0.19309021532535553, "kl": 0.29035993106663227, "learning_rate": 7.999983827572551e-06, "loss": -0.046, "num_tokens": 6282662.0, "reward": 0.7207238674163818, "reward_std": 1.1611700057983398, "rewards/rollout_reward_func/mean": 0.7207238674163818, "rewards/rollout_reward_func/std": 1.1611700057983398, "sampling/importance_sampling_ratio/max": 2.420081853866577, "sampling/importance_sampling_ratio/mean": 0.7473064661026001, "sampling/importance_sampling_ratio/min": 7.076202024336453e-08, "sampling/sampling_logp_difference/max": 2.436584949493408, "sampling/sampling_logp_difference/mean": 0.38341227173805237, "step": 245, "step_time": 27.68178200800321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8520237989723682, "epoch": 0.00246, "grad_norm": 0.1704372763633728, "kl": 0.28984597884118557, "learning_rate": 7.999983672442382e-06, "loss": -0.0482, "step": 246, "step_time": 14.37357484101085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.65625, "completions/mean_terminated_length": 5.3214287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3230876605957747, "epoch": 0.00247, "frac_reward_zero_std": 0.25, "grad_norm": 0.06820443272590637, "kl": 0.41048818454146385, "learning_rate": 7.999983516571737e-06, "loss": -0.0582, "num_tokens": 6327859.0, "reward": 1.0024962425231934, "reward_std": 1.2263967990875244, "rewards/rollout_reward_func/mean": 1.0024962425231934, "rewards/rollout_reward_func/std": 1.2263966798782349, "sampling/importance_sampling_ratio/max": 1.5986367464065552, "sampling/importance_sampling_ratio/mean": 0.8967238664627075, "sampling/importance_sampling_ratio/min": 1.9577952059535164e-07, "sampling/sampling_logp_difference/max": 1.6632417440414429, "sampling/sampling_logp_difference/mean": 0.26748520135879517, "step": 247, "step_time": 22.688984249994974 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.3202758878469467, "epoch": 0.00248, "grad_norm": 0.06587895005941391, "kl": 0.41511114314198494, "learning_rate": 7.999983359960615e-06, "loss": -0.0587, "step": 248, "step_time": 12.044456905001425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.4375, "completions/mean_terminated_length": 5.461538791656494, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.595795899629593, "epoch": 0.00249, "frac_reward_zero_std": 0.0, "grad_norm": 0.1204700917005539, "kl": 0.6194203719496727, "learning_rate": 7.999983202609019e-06, "loss": -0.104, "num_tokens": 6383097.0, "reward": 0.9296265840530396, "reward_std": 1.2502843141555786, "rewards/rollout_reward_func/mean": 0.9296265840530396, "rewards/rollout_reward_func/std": 1.2502843141555786, "sampling/importance_sampling_ratio/max": 1.6068971157073975, "sampling/importance_sampling_ratio/mean": 0.6997137665748596, "sampling/importance_sampling_ratio/min": 0.0004745457845274359, "sampling/sampling_logp_difference/max": 1.8889845609664917, "sampling/sampling_logp_difference/mean": 0.3082975149154663, "step": 249, "step_time": 26.911053394986084 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 1.589806130155921, "epoch": 0.0025, "grad_norm": 0.11452977359294891, "kl": 0.5892547573894262, "learning_rate": 7.999983044516948e-06, "loss": -0.1037, "step": 250, "step_time": 13.574989660992287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 5.200000286102295, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0445065796375275, "epoch": 0.00251, "frac_reward_zero_std": 0.0, "grad_norm": 0.21004153788089752, "kl": 0.2825557943433523, "learning_rate": 7.999982885684401e-06, "loss": -0.067, "num_tokens": 6439772.0, "reward": -0.019541755318641663, "reward_std": 1.1600631475448608, "rewards/rollout_reward_func/mean": -0.019541755318641663, "rewards/rollout_reward_func/std": 1.1600630283355713, "sampling/importance_sampling_ratio/max": 1.6308802366256714, "sampling/importance_sampling_ratio/mean": 0.5144264698028564, "sampling/importance_sampling_ratio/min": 1.490325075792498e-06, "sampling/sampling_logp_difference/max": 2.343034505844116, "sampling/sampling_logp_difference/mean": 0.3561582565307617, "step": 251, "step_time": 25.79570511901693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.059164971113205, "epoch": 0.00252, "grad_norm": 0.16256403923034668, "kl": 0.29295277735218406, "learning_rate": 7.99998272611138e-06, "loss": -0.0678, "step": 252, "step_time": 12.687551514012739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.59375, "completions/mean_terminated_length": 4.714285850524902, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9851952716708183, "epoch": 0.00253, "frac_reward_zero_std": 0.0, "grad_norm": 0.13936251401901245, "kl": 0.4699612623080611, "learning_rate": 7.999982565797882e-06, "loss": -0.0946, "num_tokens": 6490214.0, "reward": 0.3154574930667877, "reward_std": 1.297609567642212, "rewards/rollout_reward_func/mean": 0.3154574930667877, "rewards/rollout_reward_func/std": 1.2976094484329224, "sampling/importance_sampling_ratio/max": 1.5301100015640259, "sampling/importance_sampling_ratio/mean": 0.5566830635070801, "sampling/importance_sampling_ratio/min": 0.00032698112772777677, "sampling/sampling_logp_difference/max": 2.1276893615722656, "sampling/sampling_logp_difference/mean": 0.309622198343277, "step": 253, "step_time": 25.790926664019935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9879225082695484, "epoch": 0.00254, "grad_norm": 0.13845939934253693, "kl": 0.4713856065645814, "learning_rate": 7.999982404743908e-06, "loss": -0.0946, "step": 254, "step_time": 13.210990980020142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.21875, "completions/mean_terminated_length": 4.5789475440979, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.833456002175808, "epoch": 0.00255, "frac_reward_zero_std": 0.0, "grad_norm": 0.07689225673675537, "kl": 0.5970685929059982, "learning_rate": 7.999982242949461e-06, "loss": -0.0903, "num_tokens": 6547297.0, "reward": 0.6498755216598511, "reward_std": 1.2667899131774902, "rewards/rollout_reward_func/mean": 0.6498755216598511, "rewards/rollout_reward_func/std": 1.2667899131774902, "sampling/importance_sampling_ratio/max": 2.961921215057373, "sampling/importance_sampling_ratio/mean": 0.6644665002822876, "sampling/importance_sampling_ratio/min": 1.8537880350777414e-06, "sampling/sampling_logp_difference/max": 3.5219616889953613, "sampling/sampling_logp_difference/mean": 0.3417049050331116, "step": 255, "step_time": 27.17495752699324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.832944924943149, "epoch": 0.00256, "grad_norm": 0.07928754389286041, "kl": 0.636608999222517, "learning_rate": 7.999982080414539e-06, "loss": -0.0903, "step": 256, "step_time": 13.555163338998682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.09375, "completions/mean_terminated_length": 6.7916669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9408605694770813, "epoch": 0.00257, "frac_reward_zero_std": 0.0, "grad_norm": 0.1658564656972885, "kl": 0.7799804834648967, "learning_rate": 7.999981917139141e-06, "loss": -0.0648, "num_tokens": 6601783.0, "reward": -0.40602537989616394, "reward_std": 1.072817087173462, "rewards/rollout_reward_func/mean": -0.40602537989616394, "rewards/rollout_reward_func/std": 1.072817087173462, "sampling/importance_sampling_ratio/max": 1.4136523008346558, "sampling/importance_sampling_ratio/mean": 0.5490615367889404, "sampling/importance_sampling_ratio/min": 0.0001635665976209566, "sampling/sampling_logp_difference/max": 3.947434663772583, "sampling/sampling_logp_difference/mean": 0.29653066396713257, "step": 257, "step_time": 27.181076638997183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.94082573056221, "epoch": 0.00258, "grad_norm": 0.16134794056415558, "kl": 0.7413769732229412, "learning_rate": 7.999981753123268e-06, "loss": -0.0648, "step": 258, "step_time": 14.433852676025708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.625, "completions/mean_terminated_length": 6.947368621826172, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.27869613468647, "epoch": 0.00259, "frac_reward_zero_std": 0.0, "grad_norm": 0.17804895341396332, "kl": 0.15926304645836353, "learning_rate": 7.999981588366921e-06, "loss": -0.0785, "num_tokens": 6655607.0, "reward": -0.1732836812734604, "reward_std": 1.1808457374572754, "rewards/rollout_reward_func/mean": -0.1732836812734604, "rewards/rollout_reward_func/std": 1.1808457374572754, "sampling/importance_sampling_ratio/max": 1.5060956478118896, "sampling/importance_sampling_ratio/mean": 0.4011918306350708, "sampling/importance_sampling_ratio/min": 5.595446964434814e-06, "sampling/sampling_logp_difference/max": 2.237823724746704, "sampling/sampling_logp_difference/mean": 0.3475033640861511, "step": 259, "step_time": 27.60689372201159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2798516005277634, "epoch": 0.0026, "grad_norm": 0.18120983242988586, "kl": 0.1527521749958396, "learning_rate": 7.999981422870099e-06, "loss": -0.079, "step": 260, "step_time": 12.522758725986932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.96875, "completions/mean_terminated_length": 6.349999904632568, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1278695315122604, "epoch": 0.00261, "frac_reward_zero_std": 0.0, "grad_norm": 0.26781150698661804, "kl": 0.542977956822142, "learning_rate": 7.999981256632802e-06, "loss": -0.0907, "num_tokens": 6706855.0, "reward": 0.43694427609443665, "reward_std": 1.344976544380188, "rewards/rollout_reward_func/mean": 0.43694427609443665, "rewards/rollout_reward_func/std": 1.344976544380188, "sampling/importance_sampling_ratio/max": 1.7809398174285889, "sampling/importance_sampling_ratio/mean": 0.491739958524704, "sampling/importance_sampling_ratio/min": 8.226048748838366e-07, "sampling/sampling_logp_difference/max": 1.9615046977996826, "sampling/sampling_logp_difference/mean": 0.3455028533935547, "step": 261, "step_time": 25.393499118988984 }, { "clip_ratio/high_max": 0.014912281185388565, "clip_ratio/high_mean": 0.0074561405926942825, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0074561405926942825, "entropy": 2.133992463350296, "epoch": 0.00262, "grad_norm": 0.22000211477279663, "kl": 0.5021492696832865, "learning_rate": 7.999981089655028e-06, "loss": -0.0919, "step": 262, "step_time": 12.29576659196755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.65625, "completions/mean_terminated_length": 6.599999904632568, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.145221382379532, "epoch": 0.00263, "frac_reward_zero_std": 0.0, "grad_norm": 0.13600778579711914, "kl": 0.7225791388191283, "learning_rate": 7.999980921936782e-06, "loss": -0.0738, "num_tokens": 6766959.0, "reward": 0.17076602578163147, "reward_std": 1.1204365491867065, "rewards/rollout_reward_func/mean": 0.17076602578163147, "rewards/rollout_reward_func/std": 1.1204365491867065, "sampling/importance_sampling_ratio/max": 2.4188270568847656, "sampling/importance_sampling_ratio/mean": 0.5006606578826904, "sampling/importance_sampling_ratio/min": 4.565918914067879e-07, "sampling/sampling_logp_difference/max": 2.589531421661377, "sampling/sampling_logp_difference/mean": 0.3993466794490814, "step": 263, "step_time": 27.56291347798833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.141732834279537, "epoch": 0.00264, "grad_norm": 0.14105308055877686, "kl": 0.6915302332490683, "learning_rate": 7.999980753478058e-06, "loss": -0.074, "step": 264, "step_time": 13.82173013499414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.90625, "completions/mean_terminated_length": 5.166666507720947, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1177527457475662, "epoch": 0.00265, "frac_reward_zero_std": 0.0, "grad_norm": 0.08769354224205017, "kl": 0.24259313568472862, "learning_rate": 7.999980584278861e-06, "loss": -0.1198, "num_tokens": 6815229.0, "reward": 0.3932490348815918, "reward_std": 1.34110426902771, "rewards/rollout_reward_func/mean": 0.3932490348815918, "rewards/rollout_reward_func/std": 1.34110426902771, "sampling/importance_sampling_ratio/max": 1.6478407382965088, "sampling/importance_sampling_ratio/mean": 0.5365580320358276, "sampling/importance_sampling_ratio/min": 7.464888039976358e-05, "sampling/sampling_logp_difference/max": 2.5694239139556885, "sampling/sampling_logp_difference/mean": 0.3345493972301483, "step": 265, "step_time": 23.65831216998049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1158678382635117, "epoch": 0.00266, "grad_norm": 0.07785570621490479, "kl": 0.24211128754541278, "learning_rate": 7.999980414339192e-06, "loss": -0.1201, "step": 266, "step_time": 10.500742797987186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.6875, "completions/mean_terminated_length": 4.777777671813965, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0115113854408264, "epoch": 0.00267, "frac_reward_zero_std": 0.0, "grad_norm": 0.0950288400053978, "kl": 0.29187460453249514, "learning_rate": 7.999980243659046e-06, "loss": -0.0651, "num_tokens": 6862956.0, "reward": 0.004349023103713989, "reward_std": 1.2111707925796509, "rewards/rollout_reward_func/mean": 0.004349023103713989, "rewards/rollout_reward_func/std": 1.2111706733703613, "sampling/importance_sampling_ratio/max": 1.379745364189148, "sampling/importance_sampling_ratio/mean": 0.5332143306732178, "sampling/importance_sampling_ratio/min": 3.2934615319391014e-06, "sampling/sampling_logp_difference/max": 2.150033473968506, "sampling/sampling_logp_difference/mean": 0.31239137053489685, "step": 267, "step_time": 24.77673004299868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0133399218320847, "epoch": 0.00268, "grad_norm": 0.09179011732339859, "kl": 0.27953572873957455, "learning_rate": 7.999980072238424e-06, "loss": -0.0652, "step": 268, "step_time": 12.756779025978176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.65625, "completions/mean_terminated_length": 5.319999694824219, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.573663767427206, "epoch": 0.00269, "frac_reward_zero_std": 0.0, "grad_norm": 0.1014690175652504, "kl": 0.4081340190023184, "learning_rate": 7.999979900077329e-06, "loss": -0.0805, "num_tokens": 6912480.0, "reward": 0.6584208011627197, "reward_std": 1.3251863718032837, "rewards/rollout_reward_func/mean": 0.6584208011627197, "rewards/rollout_reward_func/std": 1.3251862525939941, "sampling/importance_sampling_ratio/max": 1.7179111242294312, "sampling/importance_sampling_ratio/mean": 0.7229540348052979, "sampling/importance_sampling_ratio/min": 0.00011605102918110788, "sampling/sampling_logp_difference/max": 2.139815092086792, "sampling/sampling_logp_difference/mean": 0.3219963610172272, "step": 269, "step_time": 25.892191303006257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.565867094323039, "epoch": 0.0027, "grad_norm": 0.09609243273735046, "kl": 0.42476138938218355, "learning_rate": 7.99997972717576e-06, "loss": -0.0806, "step": 270, "step_time": 13.727209305987344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.59375, "completions/mean_terminated_length": 5.239999771118164, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4334357045590878, "epoch": 0.00271, "frac_reward_zero_std": 0.0, "grad_norm": 0.11292635649442673, "kl": 1.3160812482237816, "learning_rate": 7.999979553533716e-06, "loss": -0.099, "num_tokens": 6969815.0, "reward": 0.3500032126903534, "reward_std": 1.2721855640411377, "rewards/rollout_reward_func/mean": 0.3500032126903534, "rewards/rollout_reward_func/std": 1.2721854448318481, "sampling/importance_sampling_ratio/max": 1.7349947690963745, "sampling/importance_sampling_ratio/mean": 0.692252516746521, "sampling/importance_sampling_ratio/min": 0.00044070908916182816, "sampling/sampling_logp_difference/max": 2.2630574703216553, "sampling/sampling_logp_difference/mean": 0.2833542227745056, "step": 271, "step_time": 26.780322677994263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.427693521603942, "epoch": 0.00272, "grad_norm": 0.11440466344356537, "kl": 1.3187714628875256, "learning_rate": 7.999979379151197e-06, "loss": -0.0995, "step": 272, "step_time": 13.703477327988367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 4.71999979019165, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.340543705970049, "epoch": 0.00273, "frac_reward_zero_std": 0.0, "grad_norm": 0.1213015615940094, "kl": 0.32077929098159075, "learning_rate": 7.999979204028205e-06, "loss": -0.0526, "num_tokens": 7022978.0, "reward": 0.08557599782943726, "reward_std": 1.1315733194351196, "rewards/rollout_reward_func/mean": 0.08557599782943726, "rewards/rollout_reward_func/std": 1.1315733194351196, "sampling/importance_sampling_ratio/max": 1.6975353956222534, "sampling/importance_sampling_ratio/mean": 0.7271385192871094, "sampling/importance_sampling_ratio/min": 0.0009033535607159138, "sampling/sampling_logp_difference/max": 1.7712105512619019, "sampling/sampling_logp_difference/mean": 0.2469763159751892, "step": 273, "step_time": 25.671063711997704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3445356637239456, "epoch": 0.00274, "grad_norm": 0.12303794175386429, "kl": 0.3129330137744546, "learning_rate": 7.999979028164737e-06, "loss": -0.0528, "step": 274, "step_time": 13.343626330999541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 5.230769634246826, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3058636281639338, "epoch": 0.00275, "frac_reward_zero_std": 0.0, "grad_norm": 0.06102129817008972, "kl": 0.6032416075468063, "learning_rate": 7.999978851560795e-06, "loss": -0.0881, "num_tokens": 7077360.0, "reward": 0.8084346055984497, "reward_std": 1.1525923013687134, "rewards/rollout_reward_func/mean": 0.8084346055984497, "rewards/rollout_reward_func/std": 1.1525923013687134, "sampling/importance_sampling_ratio/max": 1.4234281778335571, "sampling/importance_sampling_ratio/mean": 0.746221661567688, "sampling/importance_sampling_ratio/min": 7.298154400814383e-07, "sampling/sampling_logp_difference/max": 1.8908517360687256, "sampling/sampling_logp_difference/mean": 0.26642024517059326, "step": 275, "step_time": 22.288151931992616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3054400514811277, "epoch": 0.00276, "grad_norm": 0.059681329876184464, "kl": 0.5808637626469135, "learning_rate": 7.999978674216379e-06, "loss": -0.0882, "step": 276, "step_time": 11.196210200010682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.28125, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.036696057766676, "epoch": 0.00277, "frac_reward_zero_std": 0.0, "grad_norm": 0.17630499601364136, "kl": 0.27718696231022477, "learning_rate": 7.99997849613149e-06, "loss": -0.0759, "num_tokens": 7140403.0, "reward": 0.009615309536457062, "reward_std": 1.0234438180923462, "rewards/rollout_reward_func/mean": 0.009615309536457062, "rewards/rollout_reward_func/std": 1.0234438180923462, "sampling/importance_sampling_ratio/max": 1.4581480026245117, "sampling/importance_sampling_ratio/mean": 0.6103579998016357, "sampling/importance_sampling_ratio/min": 2.2166729252148798e-07, "sampling/sampling_logp_difference/max": 2.016991138458252, "sampling/sampling_logp_difference/mean": 0.3364870250225067, "step": 277, "step_time": 28.120424377018935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0342284925282, "epoch": 0.00278, "grad_norm": 0.18896040320396423, "kl": 0.273005501832813, "learning_rate": 7.999978317306126e-06, "loss": -0.0768, "step": 278, "step_time": 14.328954912998597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.0625, "completions/mean_terminated_length": 5.44444465637207, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.111194610595703, "epoch": 0.00279, "frac_reward_zero_std": 0.0, "grad_norm": 0.1652204692363739, "kl": 0.18024982139468193, "learning_rate": 7.999978137740288e-06, "loss": -0.0602, "num_tokens": 7197089.0, "reward": -0.3243561089038849, "reward_std": 0.9960245490074158, "rewards/rollout_reward_func/mean": -0.3243561089038849, "rewards/rollout_reward_func/std": 0.9960245490074158, "sampling/importance_sampling_ratio/max": 2.4756903648376465, "sampling/importance_sampling_ratio/mean": 0.5101466774940491, "sampling/importance_sampling_ratio/min": 2.9431935217871796e-06, "sampling/sampling_logp_difference/max": 2.1673033237457275, "sampling/sampling_logp_difference/mean": 0.3058164119720459, "step": 279, "step_time": 27.64672494601109 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.005842391401529312, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007795516401529312, "entropy": 2.1216538697481155, "epoch": 0.0028, "grad_norm": 0.15441612899303436, "kl": 0.176796720828861, "learning_rate": 7.999977957433975e-06, "loss": -0.0606, "step": 280, "step_time": 13.468066523011657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.15625, "completions/mean_terminated_length": 5.960000038146973, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8552885353565216, "epoch": 0.00281, "frac_reward_zero_std": 0.0, "grad_norm": 0.1690012365579605, "kl": 0.4678205093368888, "learning_rate": 7.999977776387188e-06, "loss": -0.1043, "num_tokens": 7246816.0, "reward": 0.5424714088439941, "reward_std": 1.3351202011108398, "rewards/rollout_reward_func/mean": 0.5424714088439941, "rewards/rollout_reward_func/std": 1.3351202011108398, "sampling/importance_sampling_ratio/max": 1.4380080699920654, "sampling/importance_sampling_ratio/mean": 0.6638157367706299, "sampling/importance_sampling_ratio/min": 0.00013272935757413507, "sampling/sampling_logp_difference/max": 1.670287847518921, "sampling/sampling_logp_difference/mean": 0.2993823289871216, "step": 281, "step_time": 26.11548476901953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.859150156378746, "epoch": 0.00282, "grad_norm": 0.17018955945968628, "kl": 0.44447993393987417, "learning_rate": 7.999977594599927e-06, "loss": -0.1044, "step": 282, "step_time": 13.564957601993228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.53125, "completions/mean_terminated_length": 5.650000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9870405048131943, "epoch": 0.00283, "frac_reward_zero_std": 0.0, "grad_norm": 0.2139492779970169, "kl": 0.34349778620526195, "learning_rate": 7.999977412072193e-06, "loss": -0.1032, "num_tokens": 7310557.0, "reward": 0.5055357217788696, "reward_std": 1.1444616317749023, "rewards/rollout_reward_func/mean": 0.5055357217788696, "rewards/rollout_reward_func/std": 1.1444615125656128, "sampling/importance_sampling_ratio/max": 1.6860079765319824, "sampling/importance_sampling_ratio/mean": 0.5673609972000122, "sampling/importance_sampling_ratio/min": 1.1434370890128775e-06, "sampling/sampling_logp_difference/max": 2.0500335693359375, "sampling/sampling_logp_difference/mean": 0.3386031687259674, "step": 283, "step_time": 29.56933155300794 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.9872873723506927, "epoch": 0.00284, "grad_norm": 0.1249164268374443, "kl": 0.33668724657036364, "learning_rate": 7.999977228803984e-06, "loss": -0.1039, "step": 284, "step_time": 14.554727392998757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.9375, "completions/mean_terminated_length": 5.727272987365723, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8372978642582893, "epoch": 0.00285, "frac_reward_zero_std": 0.0, "grad_norm": 0.11127051711082458, "kl": 0.2748618680052459, "learning_rate": 7.999977044795302e-06, "loss": -0.0784, "num_tokens": 7362229.0, "reward": -0.2216966450214386, "reward_std": 1.083838701248169, "rewards/rollout_reward_func/mean": -0.2216966450214386, "rewards/rollout_reward_func/std": 1.0838388204574585, "sampling/importance_sampling_ratio/max": 1.386260986328125, "sampling/importance_sampling_ratio/mean": 0.5563018321990967, "sampling/importance_sampling_ratio/min": 3.5662076697917655e-05, "sampling/sampling_logp_difference/max": 1.9197916984558105, "sampling/sampling_logp_difference/mean": 0.30937302112579346, "step": 285, "step_time": 27.588462932020775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8380355536937714, "epoch": 0.00286, "grad_norm": 0.1176062673330307, "kl": 0.2769333883188665, "learning_rate": 7.999976860046145e-06, "loss": -0.0787, "step": 286, "step_time": 13.899882144993171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.125, "completions/mean_terminated_length": 5.523809432983398, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.971745915710926, "epoch": 0.00287, "frac_reward_zero_std": 0.0, "grad_norm": 0.22831416130065918, "kl": 0.28408463625237346, "learning_rate": 7.999976674556518e-06, "loss": -0.0511, "num_tokens": 7419045.0, "reward": 0.5462408661842346, "reward_std": 1.3210880756378174, "rewards/rollout_reward_func/mean": 0.5462408661842346, "rewards/rollout_reward_func/std": 1.3210880756378174, "sampling/importance_sampling_ratio/max": 1.781974196434021, "sampling/importance_sampling_ratio/mean": 0.5812981128692627, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.9383258819580078, "sampling/sampling_logp_difference/mean": 0.32753288745880127, "step": 287, "step_time": 28.109730829979526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9690172858536243, "epoch": 0.00288, "grad_norm": 0.21287430822849274, "kl": 0.28140733437612653, "learning_rate": 7.999976488326414e-06, "loss": -0.0524, "step": 288, "step_time": 14.722721565020038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.125, "completions/mean_terminated_length": 5.523809432983398, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8713792115449905, "epoch": 0.00289, "frac_reward_zero_std": 0.0, "grad_norm": 0.15170027315616608, "kl": 0.2264924757182598, "learning_rate": 7.999976301355836e-06, "loss": -0.1218, "num_tokens": 7473916.0, "reward": 0.761043906211853, "reward_std": 1.2298558950424194, "rewards/rollout_reward_func/mean": 0.761043906211853, "rewards/rollout_reward_func/std": 1.2298557758331299, "sampling/importance_sampling_ratio/max": 1.9991849660873413, "sampling/importance_sampling_ratio/mean": 0.6437795162200928, "sampling/importance_sampling_ratio/min": 1.4029504313839425e-07, "sampling/sampling_logp_difference/max": 2.154130697250366, "sampling/sampling_logp_difference/mean": 0.3182738721370697, "step": 289, "step_time": 27.89025465598388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.86648690700531, "epoch": 0.0029, "grad_norm": 0.15275073051452637, "kl": 0.22599350288510323, "learning_rate": 7.999976113644787e-06, "loss": -0.1224, "step": 290, "step_time": 13.838826363993576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 4.727272987365723, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.729001708328724, "epoch": 0.00291, "frac_reward_zero_std": 0.0, "grad_norm": 0.08766661584377289, "kl": 0.2940732976421714, "learning_rate": 7.999975925193261e-06, "loss": -0.0843, "num_tokens": 7534033.0, "reward": 0.20402027666568756, "reward_std": 1.15549898147583, "rewards/rollout_reward_func/mean": 0.20402027666568756, "rewards/rollout_reward_func/std": 1.15549898147583, "sampling/importance_sampling_ratio/max": 1.502363681793213, "sampling/importance_sampling_ratio/mean": 0.710674524307251, "sampling/importance_sampling_ratio/min": 1.1606669431785122e-05, "sampling/sampling_logp_difference/max": 2.123753547668457, "sampling/sampling_logp_difference/mean": 0.3025546371936798, "step": 291, "step_time": 28.90108515501197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7273261956870556, "epoch": 0.00292, "grad_norm": 0.08953963220119476, "kl": 0.2972093904390931, "learning_rate": 7.999975736001263e-06, "loss": -0.0842, "step": 292, "step_time": 14.525411703041755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.65625, "completions/mean_terminated_length": 5.782608985900879, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.079817084595561, "epoch": 0.00293, "frac_reward_zero_std": 0.0, "grad_norm": 0.11838682740926743, "kl": 0.31936451187357306, "learning_rate": 7.999975546068793e-06, "loss": -0.0998, "num_tokens": 7590933.0, "reward": 0.10907240957021713, "reward_std": 1.183161735534668, "rewards/rollout_reward_func/mean": 0.10907240957021713, "rewards/rollout_reward_func/std": 1.183161735534668, "sampling/importance_sampling_ratio/max": 1.5652122497558594, "sampling/importance_sampling_ratio/mean": 0.5647162199020386, "sampling/importance_sampling_ratio/min": 0.00013434413995128125, "sampling/sampling_logp_difference/max": 2.160071849822998, "sampling/sampling_logp_difference/mean": 0.3448008894920349, "step": 293, "step_time": 24.86352143199474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 2.0727580580860376, "epoch": 0.00294, "grad_norm": 0.11652345955371857, "kl": 0.35317050432786345, "learning_rate": 7.999975355395847e-06, "loss": -0.0999, "step": 294, "step_time": 13.094725128015853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.125, "completions/mean_terminated_length": 4.639999866485596, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.38578312844038, "epoch": 0.00295, "frac_reward_zero_std": 0.0, "grad_norm": 0.15566487610340118, "kl": 0.19181895395740867, "learning_rate": 7.999975163982429e-06, "loss": -0.0634, "num_tokens": 7653017.0, "reward": -0.0976799726486206, "reward_std": 0.9076073169708252, "rewards/rollout_reward_func/mean": -0.0976799726486206, "rewards/rollout_reward_func/std": 0.9076072573661804, "sampling/importance_sampling_ratio/max": 1.5081599950790405, "sampling/importance_sampling_ratio/mean": 0.8055872917175293, "sampling/importance_sampling_ratio/min": 2.4995966668939218e-05, "sampling/sampling_logp_difference/max": 1.5779764652252197, "sampling/sampling_logp_difference/mean": 0.26389628648757935, "step": 295, "step_time": 26.281692966033006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3778353556990623, "epoch": 0.00296, "grad_norm": 0.15043263137340546, "kl": 0.19688293058425188, "learning_rate": 7.999974971828538e-06, "loss": -0.0637, "step": 296, "step_time": 13.80295888800174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.71875, "completions/mean_terminated_length": 4.576923370361328, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3175729401409626, "epoch": 0.00297, "frac_reward_zero_std": 0.25, "grad_norm": 0.08314503729343414, "kl": 0.22959680762141943, "learning_rate": 7.999974778934173e-06, "loss": -0.0638, "num_tokens": 7702986.0, "reward": 0.8001668453216553, "reward_std": 1.1778866052627563, "rewards/rollout_reward_func/mean": 0.8001668453216553, "rewards/rollout_reward_func/std": 1.1778866052627563, "sampling/importance_sampling_ratio/max": 1.3869574069976807, "sampling/importance_sampling_ratio/mean": 0.8567808866500854, "sampling/importance_sampling_ratio/min": 0.0003597200848162174, "sampling/sampling_logp_difference/max": 1.8430508375167847, "sampling/sampling_logp_difference/mean": 0.2530401647090912, "step": 297, "step_time": 24.149001841040445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3079493753612041, "epoch": 0.00298, "grad_norm": 0.08289262652397156, "kl": 0.23294775746762753, "learning_rate": 7.999974585299335e-06, "loss": -0.0643, "step": 298, "step_time": 12.615743010988808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.28125, "completions/mean_terminated_length": 4.481481552124023, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1306415125727654, "epoch": 0.00299, "frac_reward_zero_std": 0.0, "grad_norm": 0.11553040891885757, "kl": 1.1676307283341885, "learning_rate": 7.999974390924023e-06, "loss": -0.0802, "num_tokens": 7749609.0, "reward": 0.8987911939620972, "reward_std": 1.2999262809753418, "rewards/rollout_reward_func/mean": 0.8987911939620972, "rewards/rollout_reward_func/std": 1.2999262809753418, "sampling/importance_sampling_ratio/max": 1.6121199131011963, "sampling/importance_sampling_ratio/mean": 0.7836276292800903, "sampling/importance_sampling_ratio/min": 6.1485970945796e-06, "sampling/sampling_logp_difference/max": 1.871612548828125, "sampling/sampling_logp_difference/mean": 0.29713574051856995, "step": 299, "step_time": 19.72408454300603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.122834425419569, "epoch": 0.003, "grad_norm": 0.11759527772665024, "kl": 1.2041240967810154, "learning_rate": 7.999974195808239e-06, "loss": -0.0802, "step": 300, "step_time": 10.094645155986655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.625, "completions/mean_terminated_length": 4.933333396911621, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9350505955517292, "epoch": 0.00301, "frac_reward_zero_std": 0.0, "grad_norm": 0.1642601639032364, "kl": 0.8307745419442654, "learning_rate": 7.999973999951982e-06, "loss": -0.0772, "num_tokens": 7796900.0, "reward": 1.195993185043335, "reward_std": 1.1691087484359741, "rewards/rollout_reward_func/mean": 1.195993185043335, "rewards/rollout_reward_func/std": 1.1691086292266846, "sampling/importance_sampling_ratio/max": 1.618563175201416, "sampling/importance_sampling_ratio/mean": 0.9065151214599609, "sampling/importance_sampling_ratio/min": 5.82055035920348e-05, "sampling/sampling_logp_difference/max": 2.0369889736175537, "sampling/sampling_logp_difference/mean": 0.24913768470287323, "step": 301, "step_time": 21.70114622398978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9348099697381258, "epoch": 0.00302, "grad_norm": 0.16581377387046814, "kl": 0.8463116250932217, "learning_rate": 7.99997380335525e-06, "loss": -0.0774, "step": 302, "step_time": 12.593046111040167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.53125, "completions/mean_terminated_length": 4.34615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1100751608610153, "epoch": 0.00303, "frac_reward_zero_std": 0.0, "grad_norm": 0.1254730373620987, "kl": 0.7873565014451742, "learning_rate": 7.999973606018048e-06, "loss": -0.0444, "num_tokens": 7852302.0, "reward": 0.21820122003555298, "reward_std": 1.050923466682434, "rewards/rollout_reward_func/mean": 0.21820122003555298, "rewards/rollout_reward_func/std": 1.050923466682434, "sampling/importance_sampling_ratio/max": 1.4630824327468872, "sampling/importance_sampling_ratio/mean": 0.780868649482727, "sampling/importance_sampling_ratio/min": 5.2812783906119876e-06, "sampling/sampling_logp_difference/max": 2.683427095413208, "sampling/sampling_logp_difference/mean": 0.2636966109275818, "step": 303, "step_time": 26.681638125999598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 1.1090156380087137, "epoch": 0.00304, "grad_norm": 0.12405872344970703, "kl": 0.8109412658959627, "learning_rate": 7.99997340794037e-06, "loss": -0.0448, "step": 304, "step_time": 14.829756825987715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.65625, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1329403780400753, "epoch": 0.00305, "frac_reward_zero_std": 0.0, "grad_norm": 0.07949597388505936, "kl": 0.36719343066215515, "learning_rate": 7.999973209122222e-06, "loss": -0.0847, "num_tokens": 7908091.0, "reward": 1.01381254196167, "reward_std": 1.1934760808944702, "rewards/rollout_reward_func/mean": 1.01381254196167, "rewards/rollout_reward_func/std": 1.1934759616851807, "sampling/importance_sampling_ratio/max": 1.7921817302703857, "sampling/importance_sampling_ratio/mean": 0.9206175804138184, "sampling/importance_sampling_ratio/min": 0.00014555214147549123, "sampling/sampling_logp_difference/max": 2.0007920265197754, "sampling/sampling_logp_difference/mean": 0.23139184713363647, "step": 305, "step_time": 24.754036063997773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1365325152873993, "epoch": 0.00306, "grad_norm": 0.08057068288326263, "kl": 0.36989606730639935, "learning_rate": 7.999973009563599e-06, "loss": -0.0844, "step": 306, "step_time": 12.73858139901131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.71875, "completions/mean_terminated_length": 4.958333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7425406239926815, "epoch": 0.00307, "frac_reward_zero_std": 0.0, "grad_norm": 0.08964479714632034, "kl": 0.32046238519251347, "learning_rate": 7.999972809264505e-06, "loss": -0.0974, "num_tokens": 7966750.0, "reward": 0.44764232635498047, "reward_std": 1.2277263402938843, "rewards/rollout_reward_func/mean": 0.44764232635498047, "rewards/rollout_reward_func/std": 1.2277263402938843, "sampling/importance_sampling_ratio/max": 1.612242341041565, "sampling/importance_sampling_ratio/mean": 0.7308894991874695, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.0517258644104004, "sampling/sampling_logp_difference/mean": 0.30502983927726746, "step": 307, "step_time": 27.501347754034214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7442905008792877, "epoch": 0.00308, "grad_norm": 0.09425173699855804, "kl": 0.3176264539361, "learning_rate": 7.999972608224937e-06, "loss": -0.0976, "step": 308, "step_time": 13.807916245990782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.21875, "completions/mean_terminated_length": 4.2916669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1553622297942638, "epoch": 0.00309, "frac_reward_zero_std": 0.0, "grad_norm": 0.12732942402362823, "kl": 0.24532300233840942, "learning_rate": 7.999972406444895e-06, "loss": -0.097, "num_tokens": 8020460.0, "reward": 0.5777797698974609, "reward_std": 1.330564022064209, "rewards/rollout_reward_func/mean": 0.5777797698974609, "rewards/rollout_reward_func/std": 1.3305639028549194, "sampling/importance_sampling_ratio/max": 1.6050662994384766, "sampling/importance_sampling_ratio/mean": 0.8520320653915405, "sampling/importance_sampling_ratio/min": 0.00010966208355966955, "sampling/sampling_logp_difference/max": 1.6411139965057373, "sampling/sampling_logp_difference/mean": 0.2254658192396164, "step": 309, "step_time": 25.44837073799863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.157265231013298, "epoch": 0.0031, "grad_norm": 0.1223834902048111, "kl": 0.2442499678581953, "learning_rate": 7.999972203924383e-06, "loss": -0.0974, "step": 310, "step_time": 13.360904654997285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.46875, "completions/mean_terminated_length": 4.703703880310059, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.21495552174747, "epoch": 0.00311, "frac_reward_zero_std": 0.0, "grad_norm": 0.22885499894618988, "kl": 0.8215315751731396, "learning_rate": 7.999972000663396e-06, "loss": -0.0813, "num_tokens": 8066000.0, "reward": 0.4192485213279724, "reward_std": 1.3388216495513916, "rewards/rollout_reward_func/mean": 0.4192485213279724, "rewards/rollout_reward_func/std": 1.3388216495513916, "sampling/importance_sampling_ratio/max": 1.330858826637268, "sampling/importance_sampling_ratio/mean": 0.7029498815536499, "sampling/importance_sampling_ratio/min": 1.645378461034852e-06, "sampling/sampling_logp_difference/max": 2.8349416255950928, "sampling/sampling_logp_difference/mean": 0.2695930004119873, "step": 311, "step_time": 25.751644945004955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2204051800072193, "epoch": 0.00312, "grad_norm": 0.18416154384613037, "kl": 0.5622367821633816, "learning_rate": 7.999971796661938e-06, "loss": -0.0823, "step": 312, "step_time": 13.506184640995343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 8.9375, "completions/mean_terminated_length": 4.1052632331848145, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6953502297401428, "epoch": 0.00313, "frac_reward_zero_std": 0.0, "grad_norm": 0.15440446138381958, "kl": 0.3224534271284938, "learning_rate": 7.999971591920007e-06, "loss": -0.0879, "num_tokens": 8121819.0, "reward": 0.6265389919281006, "reward_std": 1.245671033859253, "rewards/rollout_reward_func/mean": 0.6265389919281006, "rewards/rollout_reward_func/std": 1.245671033859253, "sampling/importance_sampling_ratio/max": 1.2960108518600464, "sampling/importance_sampling_ratio/mean": 0.5717513561248779, "sampling/importance_sampling_ratio/min": 2.8732729333569296e-05, "sampling/sampling_logp_difference/max": 1.6427589654922485, "sampling/sampling_logp_difference/mean": 0.3179978132247925, "step": 313, "step_time": 27.56833545299014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6957179196178913, "epoch": 0.00314, "grad_norm": 0.1754288226366043, "kl": 0.32867103535681963, "learning_rate": 7.999971386437603e-06, "loss": -0.0886, "step": 314, "step_time": 14.37740238699189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.21875, "completions/mean_terminated_length": 6.42307710647583, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6279140301048756, "epoch": 0.00315, "frac_reward_zero_std": 0.0, "grad_norm": 0.13927413523197174, "kl": 0.5341947362758219, "learning_rate": 7.999971180214728e-06, "loss": -0.0582, "num_tokens": 8172668.0, "reward": 0.7619888782501221, "reward_std": 1.3366063833236694, "rewards/rollout_reward_func/mean": 0.7619888782501221, "rewards/rollout_reward_func/std": 1.3366062641143799, "sampling/importance_sampling_ratio/max": 1.8630290031433105, "sampling/importance_sampling_ratio/mean": 0.6355208158493042, "sampling/importance_sampling_ratio/min": 6.857460539322346e-05, "sampling/sampling_logp_difference/max": 1.7857451438903809, "sampling/sampling_logp_difference/mean": 0.32262179255485535, "step": 315, "step_time": 26.1614560810267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6296303067356348, "epoch": 0.00316, "grad_norm": 0.1364070475101471, "kl": 0.5253024301491678, "learning_rate": 7.99997097325138e-06, "loss": -0.0584, "step": 316, "step_time": 13.855756483986625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 6.153846263885498, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.82303886115551, "epoch": 0.00317, "frac_reward_zero_std": 0.0, "grad_norm": 0.08708295971155167, "kl": 0.6139407493174076, "learning_rate": 7.999970765547559e-06, "loss": -0.1113, "num_tokens": 8226627.0, "reward": 0.31399473547935486, "reward_std": 1.3040460348129272, "rewards/rollout_reward_func/mean": 0.31399473547935486, "rewards/rollout_reward_func/std": 1.3040459156036377, "sampling/importance_sampling_ratio/max": 2.524850368499756, "sampling/importance_sampling_ratio/mean": 0.6546834707260132, "sampling/importance_sampling_ratio/min": 1.690237877483014e-05, "sampling/sampling_logp_difference/max": 2.364579200744629, "sampling/sampling_logp_difference/mean": 0.3427892029285431, "step": 317, "step_time": 27.64167962600186 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.8216225355863571, "epoch": 0.00318, "grad_norm": 0.0909726545214653, "kl": 0.6329101771116257, "learning_rate": 7.999970557103267e-06, "loss": -0.1115, "step": 318, "step_time": 13.868587351971655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.90625, "completions/mean_terminated_length": 4.807692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.231896037235856, "epoch": 0.00319, "frac_reward_zero_std": 0.0, "grad_norm": 0.10038219392299652, "kl": 0.3930556084960699, "learning_rate": 7.999970347918501e-06, "loss": -0.0863, "num_tokens": 8273321.0, "reward": -0.052729591727256775, "reward_std": 1.0548903942108154, "rewards/rollout_reward_func/mean": -0.052729591727256775, "rewards/rollout_reward_func/std": 1.0548903942108154, "sampling/importance_sampling_ratio/max": 1.4928160905838013, "sampling/importance_sampling_ratio/mean": 0.8576729893684387, "sampling/importance_sampling_ratio/min": 1.2895665349788032e-05, "sampling/sampling_logp_difference/max": 2.046393871307373, "sampling/sampling_logp_difference/mean": 0.2642376124858856, "step": 319, "step_time": 23.429250087967375 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.2265208400785923, "epoch": 0.0032, "grad_norm": 0.03809111937880516, "kl": 0.4319003392010927, "learning_rate": 7.999970137993264e-06, "loss": -0.0866, "step": 320, "step_time": 12.542684927000664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.59375, "completions/mean_terminated_length": 4.851851940155029, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.080748114734888, "epoch": 0.00321, "frac_reward_zero_std": 0.0, "grad_norm": 0.1272544264793396, "kl": 0.49910086020827293, "learning_rate": 7.999969927327556e-06, "loss": -0.0762, "num_tokens": 8330111.0, "reward": 0.6444181203842163, "reward_std": 1.2440388202667236, "rewards/rollout_reward_func/mean": 0.6444181203842163, "rewards/rollout_reward_func/std": 1.2440388202667236, "sampling/importance_sampling_ratio/max": 1.7269037961959839, "sampling/importance_sampling_ratio/mean": 0.7980140447616577, "sampling/importance_sampling_ratio/min": 0.0005975493113510311, "sampling/sampling_logp_difference/max": 1.7530982494354248, "sampling/sampling_logp_difference/mean": 0.23656219244003296, "step": 321, "step_time": 26.41955542699725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0755917392671108, "epoch": 0.00322, "grad_norm": 0.11520314961671829, "kl": 0.509143827483058, "learning_rate": 7.999969715921373e-06, "loss": -0.0764, "step": 322, "step_time": 13.997268086008262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.1875, "completions/mean_terminated_length": 4.068965435028076, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8323838589712977, "epoch": 0.00323, "frac_reward_zero_std": 0.25, "grad_norm": 0.12934349477291107, "kl": 0.876643056049943, "learning_rate": 7.999969503774719e-06, "loss": -0.0538, "num_tokens": 8387303.0, "reward": 1.031559705734253, "reward_std": 1.1114102602005005, "rewards/rollout_reward_func/mean": 1.031559705734253, "rewards/rollout_reward_func/std": 1.1114102602005005, "sampling/importance_sampling_ratio/max": 1.7685271501541138, "sampling/importance_sampling_ratio/mean": 1.0587430000305176, "sampling/importance_sampling_ratio/min": 0.0011204974725842476, "sampling/sampling_logp_difference/max": 2.2598347663879395, "sampling/sampling_logp_difference/mean": 0.20560964941978455, "step": 323, "step_time": 29.233689859029255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003289473708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003289473708719015, "entropy": 0.8344784956425428, "epoch": 0.00324, "grad_norm": 0.13114623725414276, "kl": 0.9009084440767765, "learning_rate": 7.999969290887594e-06, "loss": -0.0542, "step": 324, "step_time": 16.353057616972364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.857142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8748100940138102, "epoch": 0.00325, "frac_reward_zero_std": 0.0, "grad_norm": 0.0864519327878952, "kl": 0.6546545121818781, "learning_rate": 7.999969077259998e-06, "loss": -0.0794, "num_tokens": 8436667.0, "reward": 0.40272244811058044, "reward_std": 1.2148569822311401, "rewards/rollout_reward_func/mean": 0.40272244811058044, "rewards/rollout_reward_func/std": 1.2148569822311401, "sampling/importance_sampling_ratio/max": 1.4564601182937622, "sampling/importance_sampling_ratio/mean": 0.8003513813018799, "sampling/importance_sampling_ratio/min": 4.404628271004185e-05, "sampling/sampling_logp_difference/max": 2.7065975666046143, "sampling/sampling_logp_difference/mean": 0.2469252347946167, "step": 325, "step_time": 25.443527094015735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8729008007794619, "epoch": 0.00326, "grad_norm": 0.08373407274484634, "kl": 0.6335103511810303, "learning_rate": 7.999968862891929e-06, "loss": -0.0796, "step": 326, "step_time": 13.370648836978944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.9375, "completions/mean_terminated_length": 5.642857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3038648189976811, "epoch": 0.00327, "frac_reward_zero_std": 0.0, "grad_norm": 0.0916319414973259, "kl": 0.5344808055087924, "learning_rate": 7.999968647783389e-06, "loss": -0.0947, "num_tokens": 8491161.0, "reward": 0.7241368293762207, "reward_std": 1.3181771039962769, "rewards/rollout_reward_func/mean": 0.7241368293762207, "rewards/rollout_reward_func/std": 1.3181771039962769, "sampling/importance_sampling_ratio/max": 1.6433309316635132, "sampling/importance_sampling_ratio/mean": 0.7653930187225342, "sampling/importance_sampling_ratio/min": 2.717406823649071e-05, "sampling/sampling_logp_difference/max": 3.0524511337280273, "sampling/sampling_logp_difference/mean": 0.28736764192581177, "step": 327, "step_time": 26.729830155018135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2986666848883033, "epoch": 0.00328, "grad_norm": 0.08319491147994995, "kl": 0.4676893036812544, "learning_rate": 7.999968431934376e-06, "loss": -0.0951, "step": 328, "step_time": 14.180088009001338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 4.65625, "completions/mean_terminated_length": 4.290322303771973, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.43319022189825773, "epoch": 0.00329, "frac_reward_zero_std": 0.5, "grad_norm": 0.06944133341312408, "kl": 0.4652301426976919, "learning_rate": 7.999968215344892e-06, "loss": -0.0173, "num_tokens": 8532886.0, "reward": 0.8695398569107056, "reward_std": 1.1465084552764893, "rewards/rollout_reward_func/mean": 0.8695398569107056, "rewards/rollout_reward_func/std": 1.1465084552764893, "sampling/importance_sampling_ratio/max": 1.2878338098526, "sampling/importance_sampling_ratio/mean": 1.0283234119415283, "sampling/importance_sampling_ratio/min": 0.0017749069957062602, "sampling/sampling_logp_difference/max": 1.610843539237976, "sampling/sampling_logp_difference/mean": 0.10439471900463104, "step": 329, "step_time": 21.354729505997966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4338369248434901, "epoch": 0.0033, "grad_norm": 0.0695871114730835, "kl": 0.47111791744828224, "learning_rate": 7.999967998014936e-06, "loss": -0.0175, "step": 330, "step_time": 12.304382226982852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 5.473684310913086, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.85105724260211, "epoch": 0.00331, "frac_reward_zero_std": 0.0, "grad_norm": 0.24583201110363007, "kl": 0.19286451488733292, "learning_rate": 7.999967779944508e-06, "loss": -0.081, "num_tokens": 8592859.0, "reward": 0.1476484090089798, "reward_std": 1.2115434408187866, "rewards/rollout_reward_func/mean": 0.1476484090089798, "rewards/rollout_reward_func/std": 1.211543321609497, "sampling/importance_sampling_ratio/max": 1.7869526147842407, "sampling/importance_sampling_ratio/mean": 0.5564246773719788, "sampling/importance_sampling_ratio/min": 3.885778824042063e-06, "sampling/sampling_logp_difference/max": 1.7823987007141113, "sampling/sampling_logp_difference/mean": 0.3230445981025696, "step": 331, "step_time": 27.75028311900678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.853858768939972, "epoch": 0.00332, "grad_norm": 0.2272561639547348, "kl": 0.1903864871710539, "learning_rate": 7.99996756113361e-06, "loss": -0.0819, "step": 332, "step_time": 13.530650600034278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.625, "completions/mean_terminated_length": 4.551723957061768, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9744086004793644, "epoch": 0.00333, "frac_reward_zero_std": 0.0, "grad_norm": 0.26732131838798523, "kl": 0.3967187125235796, "learning_rate": 7.999967341582239e-06, "loss": -0.0534, "num_tokens": 8644644.0, "reward": -0.07435324788093567, "reward_std": 1.1285443305969238, "rewards/rollout_reward_func/mean": -0.07435324788093567, "rewards/rollout_reward_func/std": 1.1285442113876343, "sampling/importance_sampling_ratio/max": 1.3718891143798828, "sampling/importance_sampling_ratio/mean": 0.8433834910392761, "sampling/importance_sampling_ratio/min": 0.0002775354660116136, "sampling/sampling_logp_difference/max": 1.8530962467193604, "sampling/sampling_logp_difference/mean": 0.21120841801166534, "step": 333, "step_time": 24.795721386006335 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.9765747599303722, "epoch": 0.00334, "grad_norm": 0.23273617029190063, "kl": 0.3912400100380182, "learning_rate": 7.999967121290396e-06, "loss": -0.0545, "step": 334, "step_time": 12.726650932992925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.03125, "completions/mean_terminated_length": 4.519999980926514, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3560115098953247, "epoch": 0.00335, "frac_reward_zero_std": 0.0, "grad_norm": 0.12712334096431732, "kl": 0.2674891725182533, "learning_rate": 7.999966900258084e-06, "loss": -0.0701, "num_tokens": 8696573.0, "reward": 0.5020980834960938, "reward_std": 1.3956724405288696, "rewards/rollout_reward_func/mean": 0.5020980834960938, "rewards/rollout_reward_func/std": 1.3956724405288696, "sampling/importance_sampling_ratio/max": 1.5002055168151855, "sampling/importance_sampling_ratio/mean": 0.7959368824958801, "sampling/importance_sampling_ratio/min": 4.283211637812201e-06, "sampling/sampling_logp_difference/max": 2.129265785217285, "sampling/sampling_logp_difference/mean": 0.2765524387359619, "step": 335, "step_time": 25.073066234981525 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 1.3610313050448895, "epoch": 0.00336, "grad_norm": 0.12379708141088486, "kl": 0.2551179323345423, "learning_rate": 7.9999666784853e-06, "loss": -0.0701, "step": 336, "step_time": 12.636304115963867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.40625, "completions/mean_terminated_length": 5.035714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4040322043001652, "epoch": 0.00337, "frac_reward_zero_std": 0.0, "grad_norm": 0.15529049932956696, "kl": 0.6834291201084852, "learning_rate": 7.999966455972044e-06, "loss": -0.0707, "num_tokens": 8751960.0, "reward": 0.355624794960022, "reward_std": 1.298406720161438, "rewards/rollout_reward_func/mean": 0.355624794960022, "rewards/rollout_reward_func/std": 1.298406720161438, "sampling/importance_sampling_ratio/max": 1.3334107398986816, "sampling/importance_sampling_ratio/mean": 0.7296727895736694, "sampling/importance_sampling_ratio/min": 5.477407739817863e-06, "sampling/sampling_logp_difference/max": 1.5861198902130127, "sampling/sampling_logp_difference/mean": 0.2990870177745819, "step": 337, "step_time": 24.288972634996753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4258082211017609, "epoch": 0.00338, "grad_norm": 0.1750793159008026, "kl": 0.6537726074457169, "learning_rate": 7.999966232718316e-06, "loss": -0.0717, "step": 338, "step_time": 13.117754031001823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.15625, "completions/mean_terminated_length": 6.34615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6825934126973152, "epoch": 0.00339, "frac_reward_zero_std": 0.0, "grad_norm": 0.06575275957584381, "kl": 0.19776008278131485, "learning_rate": 7.999966008724119e-06, "loss": -0.0845, "num_tokens": 8802996.0, "reward": 0.7644074559211731, "reward_std": 1.3221949338912964, "rewards/rollout_reward_func/mean": 0.7644074559211731, "rewards/rollout_reward_func/std": 1.3221949338912964, "sampling/importance_sampling_ratio/max": 1.2873064279556274, "sampling/importance_sampling_ratio/mean": 0.6780903339385986, "sampling/importance_sampling_ratio/min": 9.136982407653704e-05, "sampling/sampling_logp_difference/max": 1.6050779819488525, "sampling/sampling_logp_difference/mean": 0.2732822000980377, "step": 339, "step_time": 23.404130599999917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6905236942693591, "epoch": 0.0034, "grad_norm": 0.07039886713027954, "kl": 0.18256525322794914, "learning_rate": 7.99996578398945e-06, "loss": -0.0848, "step": 340, "step_time": 11.5627652300318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 5.454545497894287, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.057474836707115, "epoch": 0.00341, "frac_reward_zero_std": 0.0, "grad_norm": 0.17047573626041412, "kl": 0.49764249566942453, "learning_rate": 7.99996555851431e-06, "loss": -0.1023, "num_tokens": 8863838.0, "reward": 0.4162064790725708, "reward_std": 1.24704909324646, "rewards/rollout_reward_func/mean": 0.4162064790725708, "rewards/rollout_reward_func/std": 1.2470492124557495, "sampling/importance_sampling_ratio/max": 2.29103946685791, "sampling/importance_sampling_ratio/mean": 0.6819858551025391, "sampling/importance_sampling_ratio/min": 8.42562712932704e-06, "sampling/sampling_logp_difference/max": 1.909177541732788, "sampling/sampling_logp_difference/mean": 0.3565942645072937, "step": 341, "step_time": 33.18740596201678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0636845529079437, "epoch": 0.00342, "grad_norm": 0.150556743144989, "kl": 0.4316622302867472, "learning_rate": 7.999965332298698e-06, "loss": -0.1027, "step": 342, "step_time": 16.12817498300865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.15625, "completions/mean_terminated_length": 5.115384578704834, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4389455784112215, "epoch": 0.00343, "frac_reward_zero_std": 0.0, "grad_norm": 0.0627848282456398, "kl": 0.5877519380301237, "learning_rate": 7.999965105342615e-06, "loss": -0.1021, "num_tokens": 8919728.0, "reward": 0.9859083890914917, "reward_std": 1.1567820310592651, "rewards/rollout_reward_func/mean": 0.9859083890914917, "rewards/rollout_reward_func/std": 1.1567819118499756, "sampling/importance_sampling_ratio/max": 1.588557243347168, "sampling/importance_sampling_ratio/mean": 0.7784475088119507, "sampling/importance_sampling_ratio/min": 0.00016585142293479294, "sampling/sampling_logp_difference/max": 1.6663269996643066, "sampling/sampling_logp_difference/mean": 0.26234158873558044, "step": 343, "step_time": 27.948008784020203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4380555003881454, "epoch": 0.00344, "grad_norm": 0.06498058140277863, "kl": 0.614694369956851, "learning_rate": 7.999964877646064e-06, "loss": -0.1019, "step": 344, "step_time": 13.903079691022867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.90625, "completions/mean_terminated_length": 6.130434989929199, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0201530754566193, "epoch": 0.00345, "frac_reward_zero_std": 0.0, "grad_norm": 0.1059243381023407, "kl": 0.30481171142309904, "learning_rate": 7.99996464920904e-06, "loss": -0.0827, "num_tokens": 8969099.0, "reward": 0.5870963335037231, "reward_std": 1.442370057106018, "rewards/rollout_reward_func/mean": 0.5870963335037231, "rewards/rollout_reward_func/std": 1.442370057106018, "sampling/importance_sampling_ratio/max": 1.3437936305999756, "sampling/importance_sampling_ratio/mean": 0.5542685389518738, "sampling/importance_sampling_ratio/min": 9.46533473324962e-05, "sampling/sampling_logp_difference/max": 1.739785075187683, "sampling/sampling_logp_difference/mean": 0.28353744745254517, "step": 345, "step_time": 26.84096059401054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0257329642772675, "epoch": 0.00346, "grad_norm": 0.10856226086616516, "kl": 0.3135433024726808, "learning_rate": 7.999964420031546e-06, "loss": -0.0827, "step": 346, "step_time": 12.577600897027878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 4.740740776062012, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2880258709192276, "epoch": 0.00347, "frac_reward_zero_std": 0.0, "grad_norm": 0.09274237602949142, "kl": 0.26986537873744965, "learning_rate": 7.99996419011358e-06, "loss": -0.073, "num_tokens": 9024282.0, "reward": 0.8553228974342346, "reward_std": 1.2362524271011353, "rewards/rollout_reward_func/mean": 0.8553228974342346, "rewards/rollout_reward_func/std": 1.2362524271011353, "sampling/importance_sampling_ratio/max": 1.5107015371322632, "sampling/importance_sampling_ratio/mean": 0.8736566305160522, "sampling/importance_sampling_ratio/min": 5.876123054804339e-07, "sampling/sampling_logp_difference/max": 2.0817575454711914, "sampling/sampling_logp_difference/mean": 0.2874860167503357, "step": 347, "step_time": 26.54271373798838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2789839878678322, "epoch": 0.00348, "grad_norm": 0.095316082239151, "kl": 0.28138336539268494, "learning_rate": 7.999963959455145e-06, "loss": -0.0733, "step": 348, "step_time": 14.026130834012292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.0625, "completions/mean_terminated_length": 7.119999885559082, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9335633926093578, "epoch": 0.00349, "frac_reward_zero_std": 0.0, "grad_norm": 0.08540402352809906, "kl": 0.5984769468195736, "learning_rate": 7.999963728056238e-06, "loss": -0.085, "num_tokens": 9077374.0, "reward": 0.14659541845321655, "reward_std": 1.1035795211791992, "rewards/rollout_reward_func/mean": 0.14659541845321655, "rewards/rollout_reward_func/std": 1.1035795211791992, "sampling/importance_sampling_ratio/max": 1.3999998569488525, "sampling/importance_sampling_ratio/mean": 0.5157189965248108, "sampling/importance_sampling_ratio/min": 8.074015909187438e-07, "sampling/sampling_logp_difference/max": 3.233771800994873, "sampling/sampling_logp_difference/mean": 0.39135798811912537, "step": 349, "step_time": 26.959103002009215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9284060709178448, "epoch": 0.0035, "grad_norm": 0.0880630761384964, "kl": 0.6529390760697424, "learning_rate": 7.99996349591686e-06, "loss": -0.0847, "step": 350, "step_time": 12.603769373978139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.84375, "completions/mean_terminated_length": 5.559999942779541, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7682045847177505, "epoch": 0.00351, "frac_reward_zero_std": 0.0, "grad_norm": 0.10283374786376953, "kl": 0.3728863410651684, "learning_rate": 7.999963263037014e-06, "loss": -0.0967, "num_tokens": 9130942.0, "reward": -0.11492574214935303, "reward_std": 1.1238025426864624, "rewards/rollout_reward_func/mean": -0.11492574214935303, "rewards/rollout_reward_func/std": 1.1238025426864624, "sampling/importance_sampling_ratio/max": 2.570951223373413, "sampling/importance_sampling_ratio/mean": 0.6346198320388794, "sampling/importance_sampling_ratio/min": 0.0008943102438934147, "sampling/sampling_logp_difference/max": 1.6870263814926147, "sampling/sampling_logp_difference/mean": 0.28739669919013977, "step": 351, "step_time": 26.22839587702765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.7614651769399643, "epoch": 0.00352, "grad_norm": 0.08978747576475143, "kl": 0.39971924759447575, "learning_rate": 7.999963029416695e-06, "loss": -0.097, "step": 352, "step_time": 12.64839703097823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 4.888888835906982, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2915400071069598, "epoch": 0.00353, "frac_reward_zero_std": 0.25, "grad_norm": 0.12132946401834488, "kl": 0.803594745695591, "learning_rate": 7.999962795055906e-06, "loss": -0.0781, "num_tokens": 9177698.0, "reward": 0.9269940853118896, "reward_std": 1.364303469657898, "rewards/rollout_reward_func/mean": 0.9269940853118896, "rewards/rollout_reward_func/std": 1.364303469657898, "sampling/importance_sampling_ratio/max": 2.167135000228882, "sampling/importance_sampling_ratio/mean": 0.7770435810089111, "sampling/importance_sampling_ratio/min": 0.0004685812455136329, "sampling/sampling_logp_difference/max": 1.8749052286148071, "sampling/sampling_logp_difference/mean": 0.2739955186843872, "step": 353, "step_time": 23.968654210999375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.284713911358267, "epoch": 0.00354, "grad_norm": 0.12134270370006561, "kl": 0.8432599063962698, "learning_rate": 7.99996255995465e-06, "loss": -0.0784, "step": 354, "step_time": 11.860244751980645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 5.0714287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3252780195325613, "epoch": 0.00355, "frac_reward_zero_std": 0.25, "grad_norm": 0.06353209912776947, "kl": 0.31513593066483736, "learning_rate": 7.99996232411292e-06, "loss": -0.0668, "num_tokens": 9219514.0, "reward": 1.0895582437515259, "reward_std": 1.368558406829834, "rewards/rollout_reward_func/mean": 1.0895582437515259, "rewards/rollout_reward_func/std": 1.3685585260391235, "sampling/importance_sampling_ratio/max": 1.4904040098190308, "sampling/importance_sampling_ratio/mean": 0.8323930501937866, "sampling/importance_sampling_ratio/min": 1.1132708550576353e-06, "sampling/sampling_logp_difference/max": 1.6897752285003662, "sampling/sampling_logp_difference/mean": 0.23923997581005096, "step": 355, "step_time": 29.135079157029395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3189793080091476, "epoch": 0.00356, "grad_norm": 0.056123603135347366, "kl": 0.32481354009360075, "learning_rate": 7.999962087530722e-06, "loss": -0.0669, "step": 356, "step_time": 15.804268340027193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.78125, "completions/mean_terminated_length": 5.0416669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9280472099781036, "epoch": 0.00357, "frac_reward_zero_std": 0.0, "grad_norm": 0.15973597764968872, "kl": 0.2918248884379864, "learning_rate": 7.999961850208053e-06, "loss": -0.0944, "num_tokens": 9272303.0, "reward": 0.22621580958366394, "reward_std": 1.3238849639892578, "rewards/rollout_reward_func/mean": 0.22621580958366394, "rewards/rollout_reward_func/std": 1.3238849639892578, "sampling/importance_sampling_ratio/max": 1.393789529800415, "sampling/importance_sampling_ratio/mean": 0.5937265157699585, "sampling/importance_sampling_ratio/min": 2.198862966906745e-05, "sampling/sampling_logp_difference/max": 2.0719850063323975, "sampling/sampling_logp_difference/mean": 0.35019612312316895, "step": 357, "step_time": 25.788972030990408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9240614026784897, "epoch": 0.00358, "grad_norm": 0.16812407970428467, "kl": 0.3157515712082386, "learning_rate": 7.999961612144914e-06, "loss": -0.0947, "step": 358, "step_time": 13.011746789998142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.90625, "completions/mean_terminated_length": 5.639999866485596, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8119503227062523, "epoch": 0.00359, "frac_reward_zero_std": 0.0, "grad_norm": 0.14364011585712433, "kl": 1.359858587384224, "learning_rate": 7.999961373341304e-06, "loss": -0.0931, "num_tokens": 9325076.0, "reward": 0.937332808971405, "reward_std": 1.27029550075531, "rewards/rollout_reward_func/mean": 0.937332808971405, "rewards/rollout_reward_func/std": 1.27029550075531, "sampling/importance_sampling_ratio/max": 1.4402461051940918, "sampling/importance_sampling_ratio/mean": 0.6762588024139404, "sampling/importance_sampling_ratio/min": 6.37913472019136e-05, "sampling/sampling_logp_difference/max": 2.0879838466644287, "sampling/sampling_logp_difference/mean": 0.3348020315170288, "step": 359, "step_time": 26.199482943004114 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "entropy": 1.8162142322398722, "epoch": 0.0036, "grad_norm": 0.11295735090970993, "kl": 1.140316914767027, "learning_rate": 7.999961133797226e-06, "loss": -0.0939, "step": 360, "step_time": 12.764008996993653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.8125, "completions/mean_terminated_length": 5.5789475440979, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0063316971063614, "epoch": 0.00361, "frac_reward_zero_std": 0.0, "grad_norm": 0.10709656029939651, "kl": 0.48796646669507027, "learning_rate": 7.999960893512676e-06, "loss": -0.1038, "num_tokens": 9377937.0, "reward": 0.28861093521118164, "reward_std": 1.3330881595611572, "rewards/rollout_reward_func/mean": 0.28861093521118164, "rewards/rollout_reward_func/std": 1.3330881595611572, "sampling/importance_sampling_ratio/max": 1.3113808631896973, "sampling/importance_sampling_ratio/mean": 0.480202317237854, "sampling/importance_sampling_ratio/min": 8.744286787987221e-06, "sampling/sampling_logp_difference/max": 2.120666980743408, "sampling/sampling_logp_difference/mean": 0.34197258949279785, "step": 361, "step_time": 24.975460556030157 }, { "clip_ratio/high_max": 0.004464285913854837, "clip_ratio/high_mean": 0.0022321429569274187, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0053571430034935474, "entropy": 2.010585444048047, "epoch": 0.00362, "grad_norm": 0.07702119648456573, "kl": 0.374748095870018, "learning_rate": 7.999960652487659e-06, "loss": -0.1044, "step": 362, "step_time": 10.600052171983407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.3125, "completions/mean_terminated_length": 4.736842155456543, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.420042037963867, "epoch": 0.00363, "frac_reward_zero_std": 0.0, "grad_norm": 0.13076552748680115, "kl": 0.8927284753881395, "learning_rate": 7.99996041072217e-06, "loss": -0.1087, "num_tokens": 9432148.0, "reward": 0.42327672243118286, "reward_std": 1.409048080444336, "rewards/rollout_reward_func/mean": 0.42327672243118286, "rewards/rollout_reward_func/std": 1.409048080444336, "sampling/importance_sampling_ratio/max": 1.3696461915969849, "sampling/importance_sampling_ratio/mean": 0.5539258718490601, "sampling/importance_sampling_ratio/min": 1.9718759958209375e-08, "sampling/sampling_logp_difference/max": 2.001840353012085, "sampling/sampling_logp_difference/mean": 0.4005240201950073, "step": 363, "step_time": 32.67459907998273 }, { "clip_ratio/high_max": 0.017316017765551805, "clip_ratio/high_mean": 0.008658008882775903, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008658008882775903, "entropy": 2.4252839535474777, "epoch": 0.00364, "grad_norm": 0.09243443608283997, "kl": 0.7019840623252094, "learning_rate": 7.999960168216212e-06, "loss": -0.1092, "step": 364, "step_time": 15.717917402958847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.8125, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.230557307600975, "epoch": 0.00365, "frac_reward_zero_std": 0.0, "grad_norm": 0.06516712158918381, "kl": 0.1931259986013174, "learning_rate": 7.999959924969784e-06, "loss": -0.1002, "num_tokens": 9490603.0, "reward": 0.21224814653396606, "reward_std": 1.180526852607727, "rewards/rollout_reward_func/mean": 0.21224814653396606, "rewards/rollout_reward_func/std": 1.1805269718170166, "sampling/importance_sampling_ratio/max": 1.815176248550415, "sampling/importance_sampling_ratio/mean": 0.5781981348991394, "sampling/importance_sampling_ratio/min": 3.4996466524717107e-07, "sampling/sampling_logp_difference/max": 2.181375503540039, "sampling/sampling_logp_difference/mean": 0.3826737701892853, "step": 365, "step_time": 28.36590955501015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2390059381723404, "epoch": 0.00366, "grad_norm": 0.07118599116802216, "kl": 0.19752124696969986, "learning_rate": 7.999959680982886e-06, "loss": -0.1001, "step": 366, "step_time": 14.294569240970304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.78125, "completions/mean_terminated_length": 5.956521987915039, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.851223163306713, "epoch": 0.00367, "frac_reward_zero_std": 0.0, "grad_norm": 0.24504123628139496, "kl": 0.19367980165407062, "learning_rate": 7.99995943625552e-06, "loss": -0.0873, "num_tokens": 9536111.0, "reward": 0.709824800491333, "reward_std": 1.5080070495605469, "rewards/rollout_reward_func/mean": 0.709824800491333, "rewards/rollout_reward_func/std": 1.5080071687698364, "sampling/importance_sampling_ratio/max": 1.1488374471664429, "sampling/importance_sampling_ratio/mean": 0.5221021175384521, "sampling/importance_sampling_ratio/min": 0.0002608664508443326, "sampling/sampling_logp_difference/max": 1.7961324453353882, "sampling/sampling_logp_difference/mean": 0.30140525102615356, "step": 367, "step_time": 24.555494518004707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8486871384084225, "epoch": 0.00368, "grad_norm": 0.2267598807811737, "kl": 0.19532544305548072, "learning_rate": 7.999959190787684e-06, "loss": -0.0885, "step": 368, "step_time": 11.79749241497484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.96875, "completions/mean_terminated_length": 6.809524059295654, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2067830562591553, "epoch": 0.00369, "frac_reward_zero_std": 0.0, "grad_norm": 0.4014725685119629, "kl": 2.819200007710606, "learning_rate": 7.999958944579377e-06, "loss": -0.0735, "num_tokens": 9589053.0, "reward": -0.1172209233045578, "reward_std": 1.1778974533081055, "rewards/rollout_reward_func/mean": -0.1172209233045578, "rewards/rollout_reward_func/std": 1.1778974533081055, "sampling/importance_sampling_ratio/max": 1.5936273336410522, "sampling/importance_sampling_ratio/mean": 0.445989191532135, "sampling/importance_sampling_ratio/min": 0.00043078523594886065, "sampling/sampling_logp_difference/max": 1.6316399574279785, "sampling/sampling_logp_difference/mean": 0.31254613399505615, "step": 369, "step_time": 27.46512817501207 }, { "clip_ratio/high_max": 0.02211538515985012, "clip_ratio/high_mean": 0.01105769257992506, "clip_ratio/low_mean": 0.0022321429569274187, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013289835769683123, "entropy": 2.206077754497528, "epoch": 0.0037, "grad_norm": 0.2595287263393402, "kl": 2.097373898141086, "learning_rate": 7.999958697630603e-06, "loss": -0.0777, "step": 370, "step_time": 12.857455085992115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 4.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6428016759455204, "epoch": 0.00371, "frac_reward_zero_std": 0.25, "grad_norm": 0.0930514931678772, "kl": 0.5985755631700158, "learning_rate": 7.999958449941359e-06, "loss": -0.0732, "num_tokens": 9635080.0, "reward": 0.6863677501678467, "reward_std": 1.3724074363708496, "rewards/rollout_reward_func/mean": 0.6863677501678467, "rewards/rollout_reward_func/std": 1.3724074363708496, "sampling/importance_sampling_ratio/max": 1.2421752214431763, "sampling/importance_sampling_ratio/mean": 0.6661715507507324, "sampling/importance_sampling_ratio/min": 4.3382803482927557e-07, "sampling/sampling_logp_difference/max": 2.0622358322143555, "sampling/sampling_logp_difference/mean": 0.3299395442008972, "step": 371, "step_time": 27.62405862797459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6371106915175915, "epoch": 0.00372, "grad_norm": 0.11031374335289001, "kl": 0.5841893358156085, "learning_rate": 7.999958201511645e-06, "loss": -0.0729, "step": 372, "step_time": 13.502982648016769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.4375, "completions/mean_terminated_length": 5.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.378083661198616, "epoch": 0.00373, "frac_reward_zero_std": 0.25, "grad_norm": 0.02707354724407196, "kl": 0.1565653057768941, "learning_rate": 7.999957952341462e-06, "loss": -0.0662, "num_tokens": 9687488.0, "reward": 0.25962328910827637, "reward_std": 1.3428598642349243, "rewards/rollout_reward_func/mean": 0.25962328910827637, "rewards/rollout_reward_func/std": 1.3428598642349243, "sampling/importance_sampling_ratio/max": 1.4743120670318604, "sampling/importance_sampling_ratio/mean": 0.5818606615066528, "sampling/importance_sampling_ratio/min": 1.1503485666253255e-06, "sampling/sampling_logp_difference/max": 2.159019947052002, "sampling/sampling_logp_difference/mean": 0.39294371008872986, "step": 373, "step_time": 27.321901778006577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 2.386591821908951, "epoch": 0.00374, "grad_norm": 0.026319613680243492, "kl": 0.15911375265568495, "learning_rate": 7.99995770243081e-06, "loss": -0.0662, "step": 374, "step_time": 12.51474907201191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.96875, "completions/mean_terminated_length": 4.535714626312256, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.372976554557681, "epoch": 0.00375, "frac_reward_zero_std": 0.0, "grad_norm": 0.19754284620285034, "kl": 1.0965501433238387, "learning_rate": 7.999957451779688e-06, "loss": -0.0835, "num_tokens": 9736319.0, "reward": 0.7866365909576416, "reward_std": 1.2815463542938232, "rewards/rollout_reward_func/mean": 0.7866365909576416, "rewards/rollout_reward_func/std": 1.2815463542938232, "sampling/importance_sampling_ratio/max": 1.9768686294555664, "sampling/importance_sampling_ratio/mean": 0.9022121429443359, "sampling/importance_sampling_ratio/min": 1.2326013347774278e-05, "sampling/sampling_logp_difference/max": 1.7927006483078003, "sampling/sampling_logp_difference/mean": 0.28388965129852295, "step": 375, "step_time": 25.90780571299547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 1.3704926716163754, "epoch": 0.00376, "grad_norm": 0.07436896860599518, "kl": 0.9793935995548964, "learning_rate": 7.9999572003881e-06, "loss": -0.0851, "step": 376, "step_time": 13.578172806999646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.78125, "completions/mean_terminated_length": 4.653846263885498, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.389672172255814, "epoch": 0.00377, "frac_reward_zero_std": 0.0, "grad_norm": 0.057014163583517075, "kl": 0.4015487050637603, "learning_rate": 7.99995694825604e-06, "loss": -0.0821, "num_tokens": 9792229.0, "reward": 0.3231789469718933, "reward_std": 1.1929855346679688, "rewards/rollout_reward_func/mean": 0.3231789469718933, "rewards/rollout_reward_func/std": 1.1929855346679688, "sampling/importance_sampling_ratio/max": 1.470153570175171, "sampling/importance_sampling_ratio/mean": 0.816735565662384, "sampling/importance_sampling_ratio/min": 0.00039673084393143654, "sampling/sampling_logp_difference/max": 1.4518203735351562, "sampling/sampling_logp_difference/mean": 0.2523108124732971, "step": 377, "step_time": 26.113590592009132 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.3872738759964705, "epoch": 0.00378, "grad_norm": 0.04244864359498024, "kl": 0.42046866938471794, "learning_rate": 7.999956695383513e-06, "loss": -0.0823, "step": 378, "step_time": 13.206538326965529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.78125, "completions/mean_terminated_length": 5.884615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6623510047793388, "epoch": 0.00379, "frac_reward_zero_std": 0.0, "grad_norm": 0.03585031256079674, "kl": 0.45493657886981964, "learning_rate": 7.999956441770516e-06, "loss": -0.0988, "num_tokens": 9840844.0, "reward": 0.9434516429901123, "reward_std": 1.277527928352356, "rewards/rollout_reward_func/mean": 0.9434516429901123, "rewards/rollout_reward_func/std": 1.277527928352356, "sampling/importance_sampling_ratio/max": 1.2586439847946167, "sampling/importance_sampling_ratio/mean": 0.7146462202072144, "sampling/importance_sampling_ratio/min": 0.00015916347911115736, "sampling/sampling_logp_difference/max": 1.7375366687774658, "sampling/sampling_logp_difference/mean": 0.30218929052352905, "step": 379, "step_time": 32.57071359800466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.659704066812992, "epoch": 0.0038, "grad_norm": 0.03774097189307213, "kl": 0.4693446885794401, "learning_rate": 7.999956187417052e-06, "loss": -0.0988, "step": 380, "step_time": 15.866553754996858 }, { "clip_ratio/high_max": 0.005434782709926367, "clip_ratio/high_mean": 0.0027173913549631834, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027173913549631834, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.3125, "completions/mean_terminated_length": 4.2068963050842285, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6631325762718916, "epoch": 0.00381, "frac_reward_zero_std": 0.0, "grad_norm": 0.07458977401256561, "kl": 0.4496031478047371, "learning_rate": 7.999955932323117e-06, "loss": -0.0558, "num_tokens": 9892498.0, "reward": 1.3641387224197388, "reward_std": 1.0302015542984009, "rewards/rollout_reward_func/mean": 1.3641387224197388, "rewards/rollout_reward_func/std": 1.0302015542984009, "sampling/importance_sampling_ratio/max": 2.3444151878356934, "sampling/importance_sampling_ratio/mean": 0.995896577835083, "sampling/importance_sampling_ratio/min": 0.00028650599415414035, "sampling/sampling_logp_difference/max": 1.6611135005950928, "sampling/sampling_logp_difference/mean": 0.16344960033893585, "step": 381, "step_time": 25.43592314698617 }, { "clip_ratio/high_max": 0.005434782709926367, "clip_ratio/high_mean": 0.0027173913549631834, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027173913549631834, "entropy": 0.6647712578997016, "epoch": 0.00382, "grad_norm": 0.07326092571020126, "kl": 0.44391210936009884, "learning_rate": 7.999955676488715e-06, "loss": -0.0558, "step": 382, "step_time": 13.377616834026412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.65625, "completions/mean_terminated_length": 4.3913044929504395, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6500128023326397, "epoch": 0.00383, "frac_reward_zero_std": 0.0, "grad_norm": 0.06674909591674805, "kl": 0.28511063009500504, "learning_rate": 7.999955419913844e-06, "loss": -0.1115, "num_tokens": 9949458.0, "reward": 0.9429978728294373, "reward_std": 1.2292603254318237, "rewards/rollout_reward_func/mean": 0.9429978728294373, "rewards/rollout_reward_func/std": 1.2292602062225342, "sampling/importance_sampling_ratio/max": 1.4890638589859009, "sampling/importance_sampling_ratio/mean": 0.7363608479499817, "sampling/importance_sampling_ratio/min": 4.38266988567193e-06, "sampling/sampling_logp_difference/max": 1.7609431743621826, "sampling/sampling_logp_difference/mean": 0.2922914922237396, "step": 383, "step_time": 26.419642531007412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6509860325604677, "epoch": 0.00384, "grad_norm": 0.060391444712877274, "kl": 0.2824115017428994, "learning_rate": 7.999955162598504e-06, "loss": -0.1115, "step": 384, "step_time": 12.288667308021104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.9375, "completions/mean_terminated_length": 4.84615421295166, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5182328894734383, "epoch": 0.00385, "frac_reward_zero_std": 0.0, "grad_norm": 0.22104139626026154, "kl": 1.1373971654102206, "learning_rate": 7.999954904542697e-06, "loss": -0.0599, "num_tokens": 10001615.0, "reward": 0.3972712755203247, "reward_std": 1.1729503870010376, "rewards/rollout_reward_func/mean": 0.3972712755203247, "rewards/rollout_reward_func/std": 1.1729503870010376, "sampling/importance_sampling_ratio/max": 1.4610015153884888, "sampling/importance_sampling_ratio/mean": 0.7286748886108398, "sampling/importance_sampling_ratio/min": 9.237371614290169e-07, "sampling/sampling_logp_difference/max": 1.7449442148208618, "sampling/sampling_logp_difference/mean": 0.3066626787185669, "step": 385, "step_time": 24.96190016999026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5179831609129906, "epoch": 0.00386, "grad_norm": 0.20417103171348572, "kl": 1.1341164652258158, "learning_rate": 7.999954645746422e-06, "loss": -0.0613, "step": 386, "step_time": 12.695287236987497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.5625, "completions/mean_terminated_length": 4.482758522033691, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1755916038528085, "epoch": 0.00387, "frac_reward_zero_std": 0.0, "grad_norm": 0.30210697650909424, "kl": 0.8571810200810432, "learning_rate": 7.999954386209677e-06, "loss": -0.0994, "num_tokens": 10044380.0, "reward": 0.34555357694625854, "reward_std": 1.2500262260437012, "rewards/rollout_reward_func/mean": 0.34555357694625854, "rewards/rollout_reward_func/std": 1.2500262260437012, "sampling/importance_sampling_ratio/max": 1.7590198516845703, "sampling/importance_sampling_ratio/mean": 0.9375303983688354, "sampling/importance_sampling_ratio/min": 9.30136775423307e-06, "sampling/sampling_logp_difference/max": 2.15327787399292, "sampling/sampling_logp_difference/mean": 0.285480797290802, "step": 387, "step_time": 24.851795630034758 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011284722248092294, "entropy": 1.1698052566498518, "epoch": 0.00388, "grad_norm": 0.09745573252439499, "kl": 0.8897196743637323, "learning_rate": 7.999954125932465e-06, "loss": -0.1005, "step": 388, "step_time": 13.527351397991879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.03125, "completions/mean_terminated_length": 4.961538791656494, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.476190285757184, "epoch": 0.00389, "frac_reward_zero_std": 0.0, "grad_norm": 0.07863315939903259, "kl": 0.6216778419911861, "learning_rate": 7.999953864914783e-06, "loss": -0.0739, "num_tokens": 10095622.0, "reward": 0.6729540824890137, "reward_std": 1.3260468244552612, "rewards/rollout_reward_func/mean": 0.6729540824890137, "rewards/rollout_reward_func/std": 1.3260468244552612, "sampling/importance_sampling_ratio/max": 1.5454914569854736, "sampling/importance_sampling_ratio/mean": 0.740081787109375, "sampling/importance_sampling_ratio/min": 4.841257759835571e-05, "sampling/sampling_logp_difference/max": 2.384779453277588, "sampling/sampling_logp_difference/mean": 0.2920745015144348, "step": 389, "step_time": 26.402447920001578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.4802735429257154, "epoch": 0.0039, "grad_norm": 0.08276739716529846, "kl": 0.6320284325629473, "learning_rate": 7.999953603156633e-06, "loss": -0.0739, "step": 390, "step_time": 13.023112493014196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.78125, "completions/mean_terminated_length": 4.199999809265137, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3935389490798116, "epoch": 0.00391, "frac_reward_zero_std": 0.25, "grad_norm": 0.20168998837471008, "kl": 0.3087928779423237, "learning_rate": 7.999953340658018e-06, "loss": -0.0576, "num_tokens": 10149750.0, "reward": 0.6814820170402527, "reward_std": 1.379817247390747, "rewards/rollout_reward_func/mean": 0.6814820170402527, "rewards/rollout_reward_func/std": 1.379817247390747, "sampling/importance_sampling_ratio/max": 1.7001197338104248, "sampling/importance_sampling_ratio/mean": 0.8411973714828491, "sampling/importance_sampling_ratio/min": 5.3939284043735825e-06, "sampling/sampling_logp_difference/max": 1.7696927785873413, "sampling/sampling_logp_difference/mean": 0.31171244382858276, "step": 391, "step_time": 27.17150598896842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3885063640773296, "epoch": 0.00392, "grad_norm": 0.1894741654396057, "kl": 0.30715507827699184, "learning_rate": 7.999953077418933e-06, "loss": -0.0578, "step": 392, "step_time": 14.3688216789742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 4.689655303955078, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0321226976811886, "epoch": 0.00393, "frac_reward_zero_std": 0.25, "grad_norm": 0.18292467296123505, "kl": 0.7966316360980272, "learning_rate": 7.99995281343938e-06, "loss": -0.0119, "num_tokens": 10195741.0, "reward": 0.34434646368026733, "reward_std": 1.3672451972961426, "rewards/rollout_reward_func/mean": 0.34434646368026733, "rewards/rollout_reward_func/std": 1.3672453165054321, "sampling/importance_sampling_ratio/max": 1.3605597019195557, "sampling/importance_sampling_ratio/mean": 0.8794551491737366, "sampling/importance_sampling_ratio/min": 1.0632843441271689e-05, "sampling/sampling_logp_difference/max": 1.7429046630859375, "sampling/sampling_logp_difference/mean": 0.2410302460193634, "step": 393, "step_time": 22.536659963036072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0424938015639782, "epoch": 0.00394, "grad_norm": 0.2049431949853897, "kl": 0.8150756135582924, "learning_rate": 7.99995254871936e-06, "loss": -0.0125, "step": 394, "step_time": 12.775928564034984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 4.5714287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.019954465329647, "epoch": 0.00395, "frac_reward_zero_std": 0.0, "grad_norm": 0.10474539548158646, "kl": 0.6561288349330425, "learning_rate": 7.999952283258871e-06, "loss": -0.0733, "num_tokens": 10254417.0, "reward": 1.0915716886520386, "reward_std": 1.0417128801345825, "rewards/rollout_reward_func/mean": 1.0915716886520386, "rewards/rollout_reward_func/std": 1.041712760925293, "sampling/importance_sampling_ratio/max": 1.4916542768478394, "sampling/importance_sampling_ratio/mean": 0.8298190236091614, "sampling/importance_sampling_ratio/min": 0.0007330826483666897, "sampling/sampling_logp_difference/max": 1.8635826110839844, "sampling/sampling_logp_difference/mean": 0.23308521509170532, "step": 395, "step_time": 24.939230902018608 }, { "clip_ratio/high_max": 0.007352941203862429, "clip_ratio/high_mean": 0.0036764706019312143, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036764706019312143, "entropy": 1.029957801103592, "epoch": 0.00396, "grad_norm": 0.10476531833410263, "kl": 0.6001408249139786, "learning_rate": 7.999952017057914e-06, "loss": -0.0735, "step": 396, "step_time": 13.444197447010083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.8125, "completions/mean_terminated_length": 4.357142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8747363369911909, "epoch": 0.00397, "frac_reward_zero_std": 0.0, "grad_norm": 0.0948195606470108, "kl": 0.4480116479098797, "learning_rate": 7.99995175011649e-06, "loss": -0.0589, "num_tokens": 10312337.0, "reward": 0.6336318254470825, "reward_std": 1.292299509048462, "rewards/rollout_reward_func/mean": 0.6336318254470825, "rewards/rollout_reward_func/std": 1.2922996282577515, "sampling/importance_sampling_ratio/max": 1.6611734628677368, "sampling/importance_sampling_ratio/mean": 0.9064706563949585, "sampling/importance_sampling_ratio/min": 0.0004915872123092413, "sampling/sampling_logp_difference/max": 1.6933095455169678, "sampling/sampling_logp_difference/mean": 0.20146162807941437, "step": 397, "step_time": 26.86825926300662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8821172351017594, "epoch": 0.00398, "grad_norm": 0.09448602795600891, "kl": 0.4262200016528368, "learning_rate": 7.9999514824346e-06, "loss": -0.0593, "step": 398, "step_time": 13.387642890011193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.125, "completions/mean_terminated_length": 5.481481552124023, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7226065080612898, "epoch": 0.00399, "frac_reward_zero_std": 0.0, "grad_norm": 0.04499943554401398, "kl": 0.630732785910368, "learning_rate": 7.999951214012241e-06, "loss": -0.0822, "num_tokens": 10365702.0, "reward": 0.2795509099960327, "reward_std": 1.2262331247329712, "rewards/rollout_reward_func/mean": 0.2795509099960327, "rewards/rollout_reward_func/std": 1.2262331247329712, "sampling/importance_sampling_ratio/max": 1.421512484550476, "sampling/importance_sampling_ratio/mean": 0.7153156995773315, "sampling/importance_sampling_ratio/min": 1.5415525922435336e-05, "sampling/sampling_logp_difference/max": 2.2980895042419434, "sampling/sampling_logp_difference/mean": 0.3335561156272888, "step": 399, "step_time": 26.575809273999766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7333077993243933, "epoch": 0.004, "grad_norm": 0.05589369684457779, "kl": 0.5984352827072144, "learning_rate": 7.999950944849416e-06, "loss": -0.0821, "step": 400, "step_time": 13.431419756976538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.0625, "completions/mean_terminated_length": 4.956521987915039, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0776754170656204, "epoch": 0.00401, "frac_reward_zero_std": 0.0, "grad_norm": 0.02872040867805481, "kl": 0.272662153467536, "learning_rate": 7.999950674946121e-06, "loss": -0.0764, "num_tokens": 10419942.0, "reward": 0.7039344310760498, "reward_std": 1.3520112037658691, "rewards/rollout_reward_func/mean": 0.7039344310760498, "rewards/rollout_reward_func/std": 1.3520112037658691, "sampling/importance_sampling_ratio/max": 1.3247826099395752, "sampling/importance_sampling_ratio/mean": 0.7204737067222595, "sampling/importance_sampling_ratio/min": 1.7654816986123478e-07, "sampling/sampling_logp_difference/max": 2.2359800338745117, "sampling/sampling_logp_difference/mean": 0.4274958372116089, "step": 401, "step_time": 28.798576623012195 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 2.077050969004631, "epoch": 0.00402, "grad_norm": 0.024849973618984222, "kl": 0.2547404682263732, "learning_rate": 7.99995040430236e-06, "loss": -0.0764, "step": 402, "step_time": 15.603438960955827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 8.53125, "completions/mean_terminated_length": 4.050000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6081926748156548, "epoch": 0.00403, "frac_reward_zero_std": 0.0, "grad_norm": 0.0954269990324974, "kl": 0.2918041367083788, "learning_rate": 7.999950132918132e-06, "loss": -0.0892, "num_tokens": 10471785.0, "reward": 0.587649941444397, "reward_std": 1.30745267868042, "rewards/rollout_reward_func/mean": 0.587649941444397, "rewards/rollout_reward_func/std": 1.30745267868042, "sampling/importance_sampling_ratio/max": 1.3132809400558472, "sampling/importance_sampling_ratio/mean": 0.6579978466033936, "sampling/importance_sampling_ratio/min": 1.625721597520169e-05, "sampling/sampling_logp_difference/max": 1.5531761646270752, "sampling/sampling_logp_difference/mean": 0.3172294497489929, "step": 403, "step_time": 30.925163511987193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6119658555835485, "epoch": 0.00404, "grad_norm": 0.10570967197418213, "kl": 0.290954124648124, "learning_rate": 7.999949860793436e-06, "loss": -0.0893, "step": 404, "step_time": 15.129370137976366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 4.363636493682861, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.007252473384142, "epoch": 0.00405, "frac_reward_zero_std": 0.0, "grad_norm": 0.0817502811551094, "kl": 0.2138919997960329, "learning_rate": 7.999949587928276e-06, "loss": -0.0914, "num_tokens": 10527891.0, "reward": 0.38041579723358154, "reward_std": 1.1646082401275635, "rewards/rollout_reward_func/mean": 0.38041579723358154, "rewards/rollout_reward_func/std": 1.1646082401275635, "sampling/importance_sampling_ratio/max": 1.5282491445541382, "sampling/importance_sampling_ratio/mean": 0.8091932535171509, "sampling/importance_sampling_ratio/min": 7.619134407832462e-07, "sampling/sampling_logp_difference/max": 2.1573615074157715, "sampling/sampling_logp_difference/mean": 0.3591635823249817, "step": 405, "step_time": 27.358199657013756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0151425935328007, "epoch": 0.00406, "grad_norm": 0.09015364944934845, "kl": 0.22525588609278202, "learning_rate": 7.999949314322646e-06, "loss": -0.0913, "step": 406, "step_time": 13.85633354599122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.857142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1616003643721342, "epoch": 0.00407, "frac_reward_zero_std": 0.0, "grad_norm": 0.06171518564224243, "kl": 1.0414139218628407, "learning_rate": 7.999949039976548e-06, "loss": -0.0771, "num_tokens": 10569074.0, "reward": 1.0860941410064697, "reward_std": 1.189934492111206, "rewards/rollout_reward_func/mean": 1.0860941410064697, "rewards/rollout_reward_func/std": 1.189934492111206, "sampling/importance_sampling_ratio/max": 1.2437151670455933, "sampling/importance_sampling_ratio/mean": 0.7990647554397583, "sampling/importance_sampling_ratio/min": 2.68772214440105e-06, "sampling/sampling_logp_difference/max": 1.8587311506271362, "sampling/sampling_logp_difference/mean": 0.2967439889907837, "step": 407, "step_time": 20.59950357400521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.158192165195942, "epoch": 0.00408, "grad_norm": 0.05518047884106636, "kl": 0.9973134994506836, "learning_rate": 7.999948764889987e-06, "loss": -0.0773, "step": 408, "step_time": 10.879893325007288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 4.461538791656494, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3191112522035837, "epoch": 0.00409, "frac_reward_zero_std": 0.0, "grad_norm": 0.17194968461990356, "kl": 0.44573484966531396, "learning_rate": 7.999948489062955e-06, "loss": -0.0645, "num_tokens": 10628510.0, "reward": 0.6063201427459717, "reward_std": 1.190951943397522, "rewards/rollout_reward_func/mean": 0.6063201427459717, "rewards/rollout_reward_func/std": 1.190951943397522, "sampling/importance_sampling_ratio/max": 1.4349205493927002, "sampling/importance_sampling_ratio/mean": 0.819021999835968, "sampling/importance_sampling_ratio/min": 8.861796914061415e-07, "sampling/sampling_logp_difference/max": 2.127673387527466, "sampling/sampling_logp_difference/mean": 0.29982614517211914, "step": 409, "step_time": 26.153951895976206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3176155481487513, "epoch": 0.0041, "grad_norm": 0.15490135550498962, "kl": 0.43816096568480134, "learning_rate": 7.99994821249546e-06, "loss": -0.0654, "step": 410, "step_time": 12.332026759002474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.78125, "completions/mean_terminated_length": 4.653846263885498, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6499263979494572, "epoch": 0.00411, "frac_reward_zero_std": 0.0, "grad_norm": 0.12533625960350037, "kl": 0.33750917948782444, "learning_rate": 7.999947935187496e-06, "loss": -0.049, "num_tokens": 10685031.0, "reward": 0.5876490473747253, "reward_std": 1.2055102586746216, "rewards/rollout_reward_func/mean": 0.5876490473747253, "rewards/rollout_reward_func/std": 1.2055102586746216, "sampling/importance_sampling_ratio/max": 1.457763910293579, "sampling/importance_sampling_ratio/mean": 0.8634284734725952, "sampling/importance_sampling_ratio/min": 8.529335571161312e-10, "sampling/sampling_logp_difference/max": 2.7135162353515625, "sampling/sampling_logp_difference/mean": 0.43852221965789795, "step": 411, "step_time": 28.398982498023543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6556673124432564, "epoch": 0.00412, "grad_norm": 0.11888760328292847, "kl": 0.3247950542718172, "learning_rate": 7.999947657139067e-06, "loss": -0.0492, "step": 412, "step_time": 15.871529245006968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.21875, "completions/mean_terminated_length": 4.870967388153076, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.990737016312778, "epoch": 0.00413, "frac_reward_zero_std": 0.25, "grad_norm": 0.09564680606126785, "kl": 0.5155561603605747, "learning_rate": 7.99994737835017e-06, "loss": -0.0519, "num_tokens": 10730525.0, "reward": 1.108884572982788, "reward_std": 1.0713528394699097, "rewards/rollout_reward_func/mean": 1.108884572982788, "rewards/rollout_reward_func/std": 1.0713528394699097, "sampling/importance_sampling_ratio/max": 1.5719926357269287, "sampling/importance_sampling_ratio/mean": 0.9634772539138794, "sampling/importance_sampling_ratio/min": 2.6424733732710592e-05, "sampling/sampling_logp_difference/max": 2.072664499282837, "sampling/sampling_logp_difference/mean": 0.24890179932117462, "step": 413, "step_time": 20.629188640989014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0000562742352486, "epoch": 0.00414, "grad_norm": 0.09301656484603882, "kl": 0.5091223753988743, "learning_rate": 7.999947098820806e-06, "loss": -0.0521, "step": 414, "step_time": 11.84339216799708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.59375, "completions/mean_terminated_length": 5.227272987365723, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8760701268911362, "epoch": 0.00415, "frac_reward_zero_std": 0.25, "grad_norm": 0.03720127418637276, "kl": 0.2603509007021785, "learning_rate": 7.999946818550977e-06, "loss": -0.0855, "num_tokens": 10785652.0, "reward": 0.7437446117401123, "reward_std": 1.32534658908844, "rewards/rollout_reward_func/mean": 0.7437446117401123, "rewards/rollout_reward_func/std": 1.3253467082977295, "sampling/importance_sampling_ratio/max": 1.2731167078018188, "sampling/importance_sampling_ratio/mean": 0.6151260137557983, "sampling/importance_sampling_ratio/min": 5.345027602743357e-07, "sampling/sampling_logp_difference/max": 2.4057040214538574, "sampling/sampling_logp_difference/mean": 0.35856977105140686, "step": 415, "step_time": 31.925514094007667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8781000822782516, "epoch": 0.00416, "grad_norm": 0.042643170803785324, "kl": 0.30652804719284177, "learning_rate": 7.99994653754068e-06, "loss": -0.0854, "step": 416, "step_time": 15.732688995019998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 4.592592716217041, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4084629230201244, "epoch": 0.00417, "frac_reward_zero_std": 0.0, "grad_norm": 0.0869283601641655, "kl": 0.4791871327906847, "learning_rate": 7.999946255789918e-06, "loss": -0.097, "num_tokens": 10839802.0, "reward": 0.5616232752799988, "reward_std": 1.2798107862472534, "rewards/rollout_reward_func/mean": 0.5616232752799988, "rewards/rollout_reward_func/std": 1.2798106670379639, "sampling/importance_sampling_ratio/max": 1.3506834506988525, "sampling/importance_sampling_ratio/mean": 0.7693475484848022, "sampling/importance_sampling_ratio/min": 5.9633504861267284e-05, "sampling/sampling_logp_difference/max": 1.6615791320800781, "sampling/sampling_logp_difference/mean": 0.28829488158226013, "step": 417, "step_time": 26.226426622000872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4056939240545034, "epoch": 0.00418, "grad_norm": 0.07240977883338928, "kl": 0.48125865682959557, "learning_rate": 7.99994597329869e-06, "loss": -0.097, "step": 418, "step_time": 13.55421373900026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.65625, "completions/mean_terminated_length": 5.315789699554443, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1857619881629944, "epoch": 0.00419, "frac_reward_zero_std": 0.0, "grad_norm": 0.0702705979347229, "kl": 0.35785191459581256, "learning_rate": 7.999945690066996e-06, "loss": -0.0854, "num_tokens": 10906567.0, "reward": 0.07915473729372025, "reward_std": 1.0449275970458984, "rewards/rollout_reward_func/mean": 0.07915473729372025, "rewards/rollout_reward_func/std": 1.0449275970458984, "sampling/importance_sampling_ratio/max": 1.7006690502166748, "sampling/importance_sampling_ratio/mean": 0.5263004302978516, "sampling/importance_sampling_ratio/min": 5.9540102625987856e-08, "sampling/sampling_logp_difference/max": 2.3795433044433594, "sampling/sampling_logp_difference/mean": 0.3976131081581116, "step": 419, "step_time": 32.898516096000094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1831135377287865, "epoch": 0.0042, "grad_norm": 0.06928055733442307, "kl": 0.36349645303562284, "learning_rate": 7.999945406094835e-06, "loss": -0.0856, "step": 420, "step_time": 16.024653630011017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.625, "completions/mean_terminated_length": 5.800000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0149748101830482, "epoch": 0.00421, "frac_reward_zero_std": 0.0, "grad_norm": 0.047224607318639755, "kl": 0.21002719178795815, "learning_rate": 7.999945121382207e-06, "loss": -0.0932, "num_tokens": 10962176.0, "reward": 0.6025897264480591, "reward_std": 1.3050299882888794, "rewards/rollout_reward_func/mean": 0.6025897264480591, "rewards/rollout_reward_func/std": 1.3050299882888794, "sampling/importance_sampling_ratio/max": 1.3115277290344238, "sampling/importance_sampling_ratio/mean": 0.5673472881317139, "sampling/importance_sampling_ratio/min": 5.174073976377258e-06, "sampling/sampling_logp_difference/max": 2.1228690147399902, "sampling/sampling_logp_difference/mean": 0.3743932843208313, "step": 421, "step_time": 26.35986063501332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.018820159137249, "epoch": 0.00422, "grad_norm": 0.04001188650727272, "kl": 0.2158646136522293, "learning_rate": 7.999944835929116e-06, "loss": -0.0932, "step": 422, "step_time": 12.927649396995548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.09375, "completions/mean_terminated_length": 5.476190567016602, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.555800274014473, "epoch": 0.00423, "frac_reward_zero_std": 0.0, "grad_norm": 0.3087492287158966, "kl": 0.3843530286103487, "learning_rate": 7.999944549735557e-06, "loss": -0.0871, "num_tokens": 11023011.0, "reward": 0.061739444732666016, "reward_std": 1.1344397068023682, "rewards/rollout_reward_func/mean": 0.061739444732666016, "rewards/rollout_reward_func/std": 1.1344397068023682, "sampling/importance_sampling_ratio/max": 1.6648757457733154, "sampling/importance_sampling_ratio/mean": 0.5461822748184204, "sampling/importance_sampling_ratio/min": 5.992038865088034e-08, "sampling/sampling_logp_difference/max": 2.1113498210906982, "sampling/sampling_logp_difference/mean": 0.4581987261772156, "step": 423, "step_time": 33.39771185700374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 2.5465717762708664, "epoch": 0.00424, "grad_norm": 0.2726828455924988, "kl": 0.3877413850277662, "learning_rate": 7.999944262801533e-06, "loss": -0.0895, "step": 424, "step_time": 16.425209314984386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.1875, "completions/mean_terminated_length": 5.130434989929199, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6152094528079033, "epoch": 0.00425, "frac_reward_zero_std": 0.0, "grad_norm": 0.058290183544158936, "kl": 0.3534715627320111, "learning_rate": 7.999943975127043e-06, "loss": -0.0871, "num_tokens": 11082712.0, "reward": 0.4399077892303467, "reward_std": 1.2758357524871826, "rewards/rollout_reward_func/mean": 0.4399077892303467, "rewards/rollout_reward_func/std": 1.2758357524871826, "sampling/importance_sampling_ratio/max": 1.3607624769210815, "sampling/importance_sampling_ratio/mean": 0.6596513986587524, "sampling/importance_sampling_ratio/min": 2.04454821073341e-07, "sampling/sampling_logp_difference/max": 2.1853702068328857, "sampling/sampling_logp_difference/mean": 0.36109358072280884, "step": 425, "step_time": 26.367573772004107 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.002016128972172737, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005141129018738866, "entropy": 1.60313855484128, "epoch": 0.00426, "grad_norm": 0.047726161777973175, "kl": 0.380282792262733, "learning_rate": 7.999943686712088e-06, "loss": -0.0873, "step": 426, "step_time": 12.809168387990212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.214285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8581882547587156, "epoch": 0.00427, "frac_reward_zero_std": 0.0, "grad_norm": 0.11894157528877258, "kl": 0.8199393972754478, "learning_rate": 7.999943397556666e-06, "loss": -0.0769, "num_tokens": 11131325.0, "reward": 1.0426949262619019, "reward_std": 1.2807536125183105, "rewards/rollout_reward_func/mean": 1.0426949262619019, "rewards/rollout_reward_func/std": 1.280753493309021, "sampling/importance_sampling_ratio/max": 1.3999568223953247, "sampling/importance_sampling_ratio/mean": 0.8498093485832214, "sampling/importance_sampling_ratio/min": 0.0001538518990855664, "sampling/sampling_logp_difference/max": 2.427450180053711, "sampling/sampling_logp_difference/mean": 0.21648532152175903, "step": 427, "step_time": 24.930820272973506 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.8490011338144541, "epoch": 0.00428, "grad_norm": 0.050252288579940796, "kl": 0.8689598850905895, "learning_rate": 7.99994310766078e-06, "loss": -0.0771, "step": 428, "step_time": 13.755669858990586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005078125046566129, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005078125046566129, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1491924822330475, "epoch": 0.00429, "frac_reward_zero_std": 0.0, "grad_norm": 0.1925010085105896, "kl": 0.3216321673244238, "learning_rate": 7.999942817024428e-06, "loss": -0.0663, "num_tokens": 11189854.0, "reward": 0.4280496835708618, "reward_std": 1.2165603637695312, "rewards/rollout_reward_func/mean": 0.4280496835708618, "rewards/rollout_reward_func/std": 1.2165603637695312, "sampling/importance_sampling_ratio/max": 1.2945109605789185, "sampling/importance_sampling_ratio/mean": 0.7757672071456909, "sampling/importance_sampling_ratio/min": 0.00018061333685182035, "sampling/sampling_logp_difference/max": 2.465549945831299, "sampling/sampling_logp_difference/mean": 0.23178108036518097, "step": 429, "step_time": 30.471529378002742 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.1439002249389887, "epoch": 0.0043, "grad_norm": 0.1255691796541214, "kl": 0.24435014463961124, "learning_rate": 7.99994252564761e-06, "loss": -0.0667, "step": 430, "step_time": 15.622166989996913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.65625, "completions/mean_terminated_length": 5.319999694824219, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5528814606368542, "epoch": 0.00431, "frac_reward_zero_std": 0.0, "grad_norm": 0.061515066772699356, "kl": 0.9560782480984926, "learning_rate": 7.999942233530327e-06, "loss": -0.0886, "num_tokens": 11238239.0, "reward": 0.8750709295272827, "reward_std": 1.2659454345703125, "rewards/rollout_reward_func/mean": 0.8750709295272827, "rewards/rollout_reward_func/std": 1.2659454345703125, "sampling/importance_sampling_ratio/max": 1.5244203805923462, "sampling/importance_sampling_ratio/mean": 0.6757848262786865, "sampling/importance_sampling_ratio/min": 3.441292761863224e-08, "sampling/sampling_logp_difference/max": 2.8538479804992676, "sampling/sampling_logp_difference/mean": 0.3677852749824524, "step": 431, "step_time": 27.322607736015925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.547565996646881, "epoch": 0.00432, "grad_norm": 0.07346435636281967, "kl": 1.029000772163272, "learning_rate": 7.999941940672578e-06, "loss": -0.0884, "step": 432, "step_time": 14.267724583027302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.428571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8760283291339874, "epoch": 0.00433, "frac_reward_zero_std": 0.0, "grad_norm": 0.24454240500926971, "kl": 1.41189706325531, "learning_rate": 7.999941647074366e-06, "loss": -0.0237, "num_tokens": 11300403.0, "reward": 0.8008041977882385, "reward_std": 1.1850192546844482, "rewards/rollout_reward_func/mean": 0.8008041977882385, "rewards/rollout_reward_func/std": 1.1850193738937378, "sampling/importance_sampling_ratio/max": 1.2865480184555054, "sampling/importance_sampling_ratio/mean": 0.8778393268585205, "sampling/importance_sampling_ratio/min": 0.00180334341712296, "sampling/sampling_logp_difference/max": 2.1517422199249268, "sampling/sampling_logp_difference/mean": 0.1994747519493103, "step": 433, "step_time": 29.02431712199177 }, { "clip_ratio/high_max": 0.014880952890962362, "clip_ratio/high_mean": 0.007440476445481181, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007440476445481181, "entropy": 0.8869221620261669, "epoch": 0.00434, "grad_norm": 0.2500395178794861, "kl": 1.125379092991352, "learning_rate": 7.999941352735688e-06, "loss": -0.0254, "step": 434, "step_time": 15.815849019985762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.15625, "completions/mean_terminated_length": 5.15625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.721178631298244, "epoch": 0.00435, "frac_reward_zero_std": 0.25, "grad_norm": 0.08230883628129959, "kl": 0.5978588052093983, "learning_rate": 7.999941057656543e-06, "loss": -0.0492, "num_tokens": 11353401.0, "reward": 1.312035322189331, "reward_std": 1.0635077953338623, "rewards/rollout_reward_func/mean": 1.312035322189331, "rewards/rollout_reward_func/std": 1.0635076761245728, "sampling/importance_sampling_ratio/max": 1.158815860748291, "sampling/importance_sampling_ratio/mean": 0.8905472159385681, "sampling/importance_sampling_ratio/min": 0.0006041783490218222, "sampling/sampling_logp_difference/max": 1.9659547805786133, "sampling/sampling_logp_difference/mean": 0.164344921708107, "step": 435, "step_time": 22.42827209898678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7289969837293029, "epoch": 0.00436, "grad_norm": 0.08066917210817337, "kl": 0.5518095288425684, "learning_rate": 7.999940761836937e-06, "loss": -0.0492, "step": 436, "step_time": 12.33269057898724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.84375, "completions/mean_terminated_length": 4.100000381469727, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6574850901961327, "epoch": 0.00437, "frac_reward_zero_std": 0.5, "grad_norm": 0.06390692293643951, "kl": 0.3575263563543558, "learning_rate": 7.999940465276862e-06, "loss": -0.026, "num_tokens": 11388890.0, "reward": 1.4583637714385986, "reward_std": 1.0099879503250122, "rewards/rollout_reward_func/mean": 1.4583637714385986, "rewards/rollout_reward_func/std": 1.0099878311157227, "sampling/importance_sampling_ratio/max": 1.377995491027832, "sampling/importance_sampling_ratio/mean": 0.9855993390083313, "sampling/importance_sampling_ratio/min": 1.3539335668610875e-05, "sampling/sampling_logp_difference/max": 1.7671643495559692, "sampling/sampling_logp_difference/mean": 0.15895482897758484, "step": 437, "step_time": 17.67962052997609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6644884310662746, "epoch": 0.00438, "grad_norm": 0.059094615280628204, "kl": 0.35916418209671974, "learning_rate": 7.999940167976326e-06, "loss": -0.0262, "step": 438, "step_time": 10.606344313011505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 4.962963104248047, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.094541322439909, "epoch": 0.00439, "frac_reward_zero_std": 0.0, "grad_norm": 0.09949468821287155, "kl": 0.5037675024941564, "learning_rate": 7.999939869935323e-06, "loss": -0.0737, "num_tokens": 11443006.0, "reward": 0.7746108174324036, "reward_std": 1.266330361366272, "rewards/rollout_reward_func/mean": 0.7746108174324036, "rewards/rollout_reward_func/std": 1.266330361366272, "sampling/importance_sampling_ratio/max": 1.2965484857559204, "sampling/importance_sampling_ratio/mean": 0.7579382061958313, "sampling/importance_sampling_ratio/min": 0.0017358965706080198, "sampling/sampling_logp_difference/max": 1.2723844051361084, "sampling/sampling_logp_difference/mean": 0.19200357794761658, "step": 439, "step_time": 26.148947758993017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.096109351143241, "epoch": 0.0044, "grad_norm": 0.03288863226771355, "kl": 0.46512441569939256, "learning_rate": 7.999939571153855e-06, "loss": -0.0739, "step": 440, "step_time": 13.663931700997637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.96875, "completions/mean_terminated_length": 4.931034564971924, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0793896950781345, "epoch": 0.00441, "frac_reward_zero_std": 0.0, "grad_norm": 0.05960961431264877, "kl": 0.42010126262903214, "learning_rate": 7.999939271631924e-06, "loss": -0.0704, "num_tokens": 11484256.0, "reward": 1.2581136226654053, "reward_std": 1.2150710821151733, "rewards/rollout_reward_func/mean": 1.2581136226654053, "rewards/rollout_reward_func/std": 1.2150710821151733, "sampling/importance_sampling_ratio/max": 1.2502459287643433, "sampling/importance_sampling_ratio/mean": 0.8109211921691895, "sampling/importance_sampling_ratio/min": 4.3778934923466295e-05, "sampling/sampling_logp_difference/max": 1.3087854385375977, "sampling/sampling_logp_difference/mean": 0.23708394169807434, "step": 441, "step_time": 19.48593679802434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0808862457051873, "epoch": 0.00442, "grad_norm": 0.05453967675566673, "kl": 0.42625780031085014, "learning_rate": 7.999938971369529e-06, "loss": -0.0705, "step": 442, "step_time": 11.05517124904145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.53125, "completions/mean_terminated_length": 4.34615421295166, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3605419062078, "epoch": 0.00443, "frac_reward_zero_std": 0.0, "grad_norm": 0.11698685586452484, "kl": 0.2732095420360565, "learning_rate": 7.999938670366667e-06, "loss": -0.0724, "num_tokens": 11538690.0, "reward": 0.3800525665283203, "reward_std": 1.2870274782180786, "rewards/rollout_reward_func/mean": 0.3800525665283203, "rewards/rollout_reward_func/std": 1.2870274782180786, "sampling/importance_sampling_ratio/max": 1.2211843729019165, "sampling/importance_sampling_ratio/mean": 0.7918359041213989, "sampling/importance_sampling_ratio/min": 5.043558849138208e-05, "sampling/sampling_logp_difference/max": 1.798606276512146, "sampling/sampling_logp_difference/mean": 0.2586485743522644, "step": 443, "step_time": 25.763210079006967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.364811271429062, "epoch": 0.00444, "grad_norm": 0.07822208106517792, "kl": 0.2827966772019863, "learning_rate": 7.999938368623343e-06, "loss": -0.0725, "step": 444, "step_time": 12.819366937008454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 5.103448390960693, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3808388859033585, "epoch": 0.00445, "frac_reward_zero_std": 0.25, "grad_norm": 0.09428074955940247, "kl": 0.6288446392863989, "learning_rate": 7.999938066139555e-06, "loss": -0.0551, "num_tokens": 11590421.0, "reward": 0.8362398147583008, "reward_std": 1.3203457593917847, "rewards/rollout_reward_func/mean": 0.8362398147583008, "rewards/rollout_reward_func/std": 1.3203457593917847, "sampling/importance_sampling_ratio/max": 1.4611669778823853, "sampling/importance_sampling_ratio/mean": 0.8145240545272827, "sampling/importance_sampling_ratio/min": 7.694058876950294e-05, "sampling/sampling_logp_difference/max": 1.856740951538086, "sampling/sampling_logp_difference/mean": 0.24622055888175964, "step": 445, "step_time": 26.438954812008888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3937089331448078, "epoch": 0.00446, "grad_norm": 0.09408678859472275, "kl": 0.608628561720252, "learning_rate": 7.9999377629153e-06, "loss": -0.0554, "step": 446, "step_time": 13.823462057989673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.875, "completions/mean_terminated_length": 4.695652484893799, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8030090257525444, "epoch": 0.00447, "frac_reward_zero_std": 0.0, "grad_norm": 0.10345807671546936, "kl": 0.23997887782752514, "learning_rate": 7.999937458950583e-06, "loss": -0.0765, "num_tokens": 11647416.0, "reward": 0.2744188904762268, "reward_std": 1.1892369985580444, "rewards/rollout_reward_func/mean": 0.2744188904762268, "rewards/rollout_reward_func/std": 1.1892368793487549, "sampling/importance_sampling_ratio/max": 1.7295136451721191, "sampling/importance_sampling_ratio/mean": 0.6294227838516235, "sampling/importance_sampling_ratio/min": 4.497026384342462e-05, "sampling/sampling_logp_difference/max": 1.6459925174713135, "sampling/sampling_logp_difference/mean": 0.33753928542137146, "step": 447, "step_time": 26.434921489984845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.8095491230487823, "epoch": 0.00448, "grad_norm": 0.09646341949701309, "kl": 0.237864563241601, "learning_rate": 7.999937154245402e-06, "loss": -0.0765, "step": 448, "step_time": 13.032396234004409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.65625, "completions/mean_terminated_length": 4.178571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0977578982710838, "epoch": 0.00449, "frac_reward_zero_std": 0.0, "grad_norm": 0.08788015693426132, "kl": 1.1822762954980135, "learning_rate": 7.999936848799756e-06, "loss": -0.0907, "num_tokens": 11685609.0, "reward": 0.5203649997711182, "reward_std": 1.2779252529144287, "rewards/rollout_reward_func/mean": 0.5203649997711182, "rewards/rollout_reward_func/std": 1.2779252529144287, "sampling/importance_sampling_ratio/max": 1.1389968395233154, "sampling/importance_sampling_ratio/mean": 0.8604111075401306, "sampling/importance_sampling_ratio/min": 1.0578751243883744e-05, "sampling/sampling_logp_difference/max": 1.9802908897399902, "sampling/sampling_logp_difference/mean": 0.23854875564575195, "step": 449, "step_time": 19.02904503598984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0999586526304483, "epoch": 0.0045, "grad_norm": 0.10073187202215195, "kl": 1.1141001246869564, "learning_rate": 7.999936542613647e-06, "loss": -0.0908, "step": 450, "step_time": 10.160348384015379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 4.307692527770996, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1233605593442917, "epoch": 0.00451, "frac_reward_zero_std": 0.25, "grad_norm": 0.15409298241138458, "kl": 0.564220504835248, "learning_rate": 7.999936235687075e-06, "loss": -0.0458, "num_tokens": 11739894.0, "reward": 0.668694019317627, "reward_std": 1.2980217933654785, "rewards/rollout_reward_func/mean": 0.668694019317627, "rewards/rollout_reward_func/std": 1.2980217933654785, "sampling/importance_sampling_ratio/max": 1.346974492073059, "sampling/importance_sampling_ratio/mean": 0.7791138291358948, "sampling/importance_sampling_ratio/min": 2.0314815628807992e-05, "sampling/sampling_logp_difference/max": 2.367201328277588, "sampling/sampling_logp_difference/mean": 0.23265516757965088, "step": 451, "step_time": 26.40877680799167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.122017253190279, "epoch": 0.00452, "grad_norm": 0.13047046959400177, "kl": 0.5796921215951443, "learning_rate": 7.999935928020036e-06, "loss": -0.0462, "step": 452, "step_time": 13.150720901991008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.09375, "completions/mean_terminated_length": 4.599999904632568, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5774746984243393, "epoch": 0.00453, "frac_reward_zero_std": 0.25, "grad_norm": 0.23570406436920166, "kl": 0.3503830572590232, "learning_rate": 7.999935619612536e-06, "loss": -0.0641, "num_tokens": 11793128.0, "reward": 0.7212712168693542, "reward_std": 1.2856285572052002, "rewards/rollout_reward_func/mean": 0.7212712168693542, "rewards/rollout_reward_func/std": 1.2856285572052002, "sampling/importance_sampling_ratio/max": 1.9689979553222656, "sampling/importance_sampling_ratio/mean": 0.7813981771469116, "sampling/importance_sampling_ratio/min": 1.1570838154284502e-07, "sampling/sampling_logp_difference/max": 2.5811986923217773, "sampling/sampling_logp_difference/mean": 0.3927673101425171, "step": 453, "step_time": 27.437538860016502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003289473708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003289473708719015, "entropy": 1.564395410940051, "epoch": 0.00454, "grad_norm": 0.08419816941022873, "kl": 0.344206091016531, "learning_rate": 7.999935310464572e-06, "loss": -0.0654, "step": 454, "step_time": 12.561691522030742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.65625, "completions/mean_terminated_length": 5.3214287757873535, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3630571831017733, "epoch": 0.00455, "frac_reward_zero_std": 0.25, "grad_norm": 0.13038761913776398, "kl": 0.4235863834619522, "learning_rate": 7.999935000576144e-06, "loss": -0.0658, "num_tokens": 11846281.0, "reward": 0.7092806100845337, "reward_std": 1.3960583209991455, "rewards/rollout_reward_func/mean": 0.7092806100845337, "rewards/rollout_reward_func/std": 1.3960583209991455, "sampling/importance_sampling_ratio/max": 1.358339548110962, "sampling/importance_sampling_ratio/mean": 0.7058398723602295, "sampling/importance_sampling_ratio/min": 0.0001888327533379197, "sampling/sampling_logp_difference/max": 1.6537861824035645, "sampling/sampling_logp_difference/mean": 0.26293104887008667, "step": 455, "step_time": 28.07011695599067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.3613724187016487, "epoch": 0.00456, "grad_norm": 0.11307612806558609, "kl": 0.429143488407135, "learning_rate": 7.999934689947254e-06, "loss": -0.0662, "step": 456, "step_time": 14.369909663015278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.15625, "completions/mean_terminated_length": 4.806451320648193, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7250810880213976, "epoch": 0.00457, "frac_reward_zero_std": 0.0, "grad_norm": 0.19592711329460144, "kl": 0.3600617107003927, "learning_rate": 7.999934378577899e-06, "loss": -0.0472, "num_tokens": 11884256.0, "reward": 0.48650598526000977, "reward_std": 1.5022187232971191, "rewards/rollout_reward_func/mean": 0.48650598526000977, "rewards/rollout_reward_func/std": 1.5022188425064087, "sampling/importance_sampling_ratio/max": 1.2659556865692139, "sampling/importance_sampling_ratio/mean": 0.9408397078514099, "sampling/importance_sampling_ratio/min": 5.707570380764082e-06, "sampling/sampling_logp_difference/max": 1.784334659576416, "sampling/sampling_logp_difference/mean": 0.1969248652458191, "step": 457, "step_time": 18.424654804024613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7139457818120718, "epoch": 0.00458, "grad_norm": 0.18289966881275177, "kl": 0.3565381392836571, "learning_rate": 7.999934066468082e-06, "loss": -0.047, "step": 458, "step_time": 10.218523703020765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.9375, "completions/mean_terminated_length": 5.259259223937988, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3917617872357368, "epoch": 0.00459, "frac_reward_zero_std": 0.0, "grad_norm": 0.08043952286243439, "kl": 0.42355852015316486, "learning_rate": 7.9999337536178e-06, "loss": -0.093, "num_tokens": 11945185.0, "reward": 1.1962968111038208, "reward_std": 1.098888635635376, "rewards/rollout_reward_func/mean": 1.1962968111038208, "rewards/rollout_reward_func/std": 1.098888635635376, "sampling/importance_sampling_ratio/max": 1.438449501991272, "sampling/importance_sampling_ratio/mean": 0.7723736763000488, "sampling/importance_sampling_ratio/min": 5.032371063862229e-06, "sampling/sampling_logp_difference/max": 2.4746153354644775, "sampling/sampling_logp_difference/mean": 0.26692378520965576, "step": 459, "step_time": 26.98716479499126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3895797207951546, "epoch": 0.0046, "grad_norm": 0.07756901532411575, "kl": 0.42835704796016216, "learning_rate": 7.999933440027056e-06, "loss": -0.0931, "step": 460, "step_time": 13.618471975016291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.0625, "completions/mean_terminated_length": 5.909090995788574, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4252075403928757, "epoch": 0.00461, "frac_reward_zero_std": 0.0, "grad_norm": 0.11267035454511642, "kl": 0.45977816451340914, "learning_rate": 7.999933125695849e-06, "loss": -0.0795, "num_tokens": 12002271.0, "reward": -0.18177485466003418, "reward_std": 1.1411656141281128, "rewards/rollout_reward_func/mean": -0.18177485466003418, "rewards/rollout_reward_func/std": 1.1411656141281128, "sampling/importance_sampling_ratio/max": 1.3133054971694946, "sampling/importance_sampling_ratio/mean": 0.48842647671699524, "sampling/importance_sampling_ratio/min": 9.08707170310663e-06, "sampling/sampling_logp_difference/max": 1.975367546081543, "sampling/sampling_logp_difference/mean": 0.4037695527076721, "step": 461, "step_time": 26.88290485199832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4206739962100983, "epoch": 0.00462, "grad_norm": 0.11157257109880447, "kl": 0.47851778473705053, "learning_rate": 7.99993281062418e-06, "loss": -0.0796, "step": 462, "step_time": 13.25839911098592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.78125, "completions/mean_terminated_length": 4.3214287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1960998140275478, "epoch": 0.00463, "frac_reward_zero_std": 0.25, "grad_norm": 0.08446156978607178, "kl": 0.6579389506950974, "learning_rate": 7.999932494812047e-06, "loss": -0.052, "num_tokens": 12054447.0, "reward": 1.1889532804489136, "reward_std": 1.161553144454956, "rewards/rollout_reward_func/mean": 1.1889532804489136, "rewards/rollout_reward_func/std": 1.1615530252456665, "sampling/importance_sampling_ratio/max": 1.3167359828948975, "sampling/importance_sampling_ratio/mean": 0.7837355136871338, "sampling/importance_sampling_ratio/min": 5.605142519016226e-07, "sampling/sampling_logp_difference/max": 2.4700465202331543, "sampling/sampling_logp_difference/mean": 0.2768908143043518, "step": 463, "step_time": 26.630452101002447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.193982221186161, "epoch": 0.00464, "grad_norm": 0.07998504489660263, "kl": 0.6449139332398772, "learning_rate": 7.999932178259451e-06, "loss": -0.0521, "step": 464, "step_time": 13.544028707969119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.09375, "completions/mean_terminated_length": 4.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.798979815095663, "epoch": 0.00465, "frac_reward_zero_std": 0.0, "grad_norm": 0.11582102626562119, "kl": 0.37610628083348274, "learning_rate": 7.999931860966393e-06, "loss": -0.0886, "num_tokens": 12112577.0, "reward": 0.42413151264190674, "reward_std": 1.2650744915008545, "rewards/rollout_reward_func/mean": 0.42413151264190674, "rewards/rollout_reward_func/std": 1.2650744915008545, "sampling/importance_sampling_ratio/max": 1.367895483970642, "sampling/importance_sampling_ratio/mean": 0.6371347904205322, "sampling/importance_sampling_ratio/min": 1.9293314835522324e-05, "sampling/sampling_logp_difference/max": 1.8583743572235107, "sampling/sampling_logp_difference/mean": 0.29333287477493286, "step": 465, "step_time": 26.783655282008112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7983856573700905, "epoch": 0.00466, "grad_norm": 0.11196551471948624, "kl": 0.3572063762694597, "learning_rate": 7.999931542932872e-06, "loss": -0.0886, "step": 466, "step_time": 12.927265096979681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.4375, "completions/mean_terminated_length": 5.039999961853027, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9251768374815583, "epoch": 0.00467, "frac_reward_zero_std": 0.0, "grad_norm": 0.06698925048112869, "kl": 0.6433166498318315, "learning_rate": 7.999931224158886e-06, "loss": -0.0793, "num_tokens": 12165924.0, "reward": 0.987528920173645, "reward_std": 1.235754132270813, "rewards/rollout_reward_func/mean": 0.987528920173645, "rewards/rollout_reward_func/std": 1.235754132270813, "sampling/importance_sampling_ratio/max": 1.2295091152191162, "sampling/importance_sampling_ratio/mean": 0.6943091154098511, "sampling/importance_sampling_ratio/min": 1.3740665281147812e-06, "sampling/sampling_logp_difference/max": 1.9507899284362793, "sampling/sampling_logp_difference/mean": 0.3991580903530121, "step": 467, "step_time": 30.809043968009064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.921856064349413, "epoch": 0.00468, "grad_norm": 0.06653490662574768, "kl": 0.5888725705444813, "learning_rate": 7.999930904644442e-06, "loss": -0.0794, "step": 468, "step_time": 15.549144725009683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.1875, "completions/mean_terminated_length": 3.92307710647583, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4153745230287313, "epoch": 0.00469, "frac_reward_zero_std": 0.0, "grad_norm": 0.5172695517539978, "kl": 0.4121108800172806, "learning_rate": 7.999930584389531e-06, "loss": -0.0565, "num_tokens": 12217973.0, "reward": 0.4754069745540619, "reward_std": 1.2498140335083008, "rewards/rollout_reward_func/mean": 0.4754069745540619, "rewards/rollout_reward_func/std": 1.2498139142990112, "sampling/importance_sampling_ratio/max": 1.442152976989746, "sampling/importance_sampling_ratio/mean": 0.8686262369155884, "sampling/importance_sampling_ratio/min": 4.4436976054385013e-07, "sampling/sampling_logp_difference/max": 1.9826046228408813, "sampling/sampling_logp_difference/mean": 0.2957240343093872, "step": 469, "step_time": 27.899645821016748 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02864583395421505, "entropy": 1.4131847023963928, "epoch": 0.0047, "grad_norm": 0.11300970613956451, "kl": 0.4261714555323124, "learning_rate": 7.999930263394161e-06, "loss": -0.0581, "step": 470, "step_time": 12.415778659997159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.96875, "completions/mean_terminated_length": 4.111111164093018, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0369449201971292, "epoch": 0.00471, "frac_reward_zero_std": 0.25, "grad_norm": 0.08133698254823685, "kl": 0.5389839624986053, "learning_rate": 7.999929941658327e-06, "loss": -0.057, "num_tokens": 12269910.0, "reward": 0.7669495344161987, "reward_std": 1.1844732761383057, "rewards/rollout_reward_func/mean": 0.7669495344161987, "rewards/rollout_reward_func/std": 1.1844732761383057, "sampling/importance_sampling_ratio/max": 1.4956231117248535, "sampling/importance_sampling_ratio/mean": 0.9147366285324097, "sampling/importance_sampling_ratio/min": 4.111065936740488e-05, "sampling/sampling_logp_difference/max": 2.034562587738037, "sampling/sampling_logp_difference/mean": 0.21933630108833313, "step": 471, "step_time": 25.612296635023085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0349289504811168, "epoch": 0.00472, "grad_norm": 0.08344987034797668, "kl": 0.5337476283311844, "learning_rate": 7.999929619182034e-06, "loss": -0.0567, "step": 472, "step_time": 12.807180178977433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.8125, "completions/mean_terminated_length": 4.758620738983154, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1108290292322636, "epoch": 0.00473, "frac_reward_zero_std": 0.0, "grad_norm": 0.17016172409057617, "kl": 0.43404428102076054, "learning_rate": 7.999929295965276e-06, "loss": -0.0615, "num_tokens": 12329186.0, "reward": 1.0551027059555054, "reward_std": 1.2541471719741821, "rewards/rollout_reward_func/mean": 1.0551027059555054, "rewards/rollout_reward_func/std": 1.2541471719741821, "sampling/importance_sampling_ratio/max": 1.4290049076080322, "sampling/importance_sampling_ratio/mean": 0.8679360151290894, "sampling/importance_sampling_ratio/min": 5.511014933290426e-06, "sampling/sampling_logp_difference/max": 2.3257603645324707, "sampling/sampling_logp_difference/mean": 0.25111114978790283, "step": 473, "step_time": 30.83069151797099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1169802360236645, "epoch": 0.00474, "grad_norm": 0.18450917303562164, "kl": 0.4306026007980108, "learning_rate": 7.999928972008055e-06, "loss": -0.0621, "step": 474, "step_time": 16.568827498020255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.53125, "completions/mean_terminated_length": 4.34615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4334229286760092, "epoch": 0.00475, "frac_reward_zero_std": 0.25, "grad_norm": 0.10671287029981613, "kl": 0.5972630456089973, "learning_rate": 7.999928647310374e-06, "loss": -0.0455, "num_tokens": 12388328.0, "reward": 0.8299505114555359, "reward_std": 1.2488964796066284, "rewards/rollout_reward_func/mean": 0.8299505114555359, "rewards/rollout_reward_func/std": 1.248896598815918, "sampling/importance_sampling_ratio/max": 1.3196228742599487, "sampling/importance_sampling_ratio/mean": 0.778179407119751, "sampling/importance_sampling_ratio/min": 2.9255716071929783e-08, "sampling/sampling_logp_difference/max": 2.640761375427246, "sampling/sampling_logp_difference/mean": 0.3520989418029785, "step": 475, "step_time": 31.886071765009547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4515676125884056, "epoch": 0.00476, "grad_norm": 0.11464396864175797, "kl": 0.5994037464261055, "learning_rate": 7.99992832187223e-06, "loss": -0.0455, "step": 476, "step_time": 15.718643021027674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 4.2727274894714355, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.402913162484765, "epoch": 0.00477, "frac_reward_zero_std": 0.25, "grad_norm": 0.05069776996970177, "kl": 0.2706878278404474, "learning_rate": 7.999927995693626e-06, "loss": -0.0598, "num_tokens": 12441521.0, "reward": 0.9154433012008667, "reward_std": 1.1946449279785156, "rewards/rollout_reward_func/mean": 0.9154433012008667, "rewards/rollout_reward_func/std": 1.1946449279785156, "sampling/importance_sampling_ratio/max": 1.8356740474700928, "sampling/importance_sampling_ratio/mean": 0.7939757108688354, "sampling/importance_sampling_ratio/min": 3.293573172413744e-05, "sampling/sampling_logp_difference/max": 1.3368699550628662, "sampling/sampling_logp_difference/mean": 0.2564294934272766, "step": 477, "step_time": 30.098733079008525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4095552004873753, "epoch": 0.00478, "grad_norm": 0.05272408947348595, "kl": 0.2669047322124243, "learning_rate": 7.999927668774559e-06, "loss": -0.0598, "step": 478, "step_time": 13.938759613985894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.625, "completions/mean_terminated_length": 5.2727274894714355, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9710360057651997, "epoch": 0.00479, "frac_reward_zero_std": 0.0, "grad_norm": 0.10420364886522293, "kl": 0.29480968229472637, "learning_rate": 7.99992734111503e-06, "loss": -0.0885, "num_tokens": 12503425.0, "reward": 0.4308859705924988, "reward_std": 1.248565912246704, "rewards/rollout_reward_func/mean": 0.4308859705924988, "rewards/rollout_reward_func/std": 1.248565912246704, "sampling/importance_sampling_ratio/max": 1.6503615379333496, "sampling/importance_sampling_ratio/mean": 0.6285229325294495, "sampling/importance_sampling_ratio/min": 5.880324351892341e-06, "sampling/sampling_logp_difference/max": 1.9557814598083496, "sampling/sampling_logp_difference/mean": 0.40511709451675415, "step": 479, "step_time": 35.12866819999181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9817873015999794, "epoch": 0.0048, "grad_norm": 0.10171297937631607, "kl": 0.299719444476068, "learning_rate": 7.99992701271504e-06, "loss": -0.0888, "step": 480, "step_time": 16.80589309698553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.6875, "completions/mean_terminated_length": 5.769230842590332, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8792079165577888, "epoch": 0.00481, "frac_reward_zero_std": 0.0, "grad_norm": 0.07938799262046814, "kl": 0.54917380400002, "learning_rate": 7.99992668357459e-06, "loss": -0.0977, "num_tokens": 12556283.0, "reward": 0.46629735827445984, "reward_std": 1.4127329587936401, "rewards/rollout_reward_func/mean": 0.46629735827445984, "rewards/rollout_reward_func/std": 1.4127329587936401, "sampling/importance_sampling_ratio/max": 1.4791808128356934, "sampling/importance_sampling_ratio/mean": 0.6637833118438721, "sampling/importance_sampling_ratio/min": 1.6524501234016498e-06, "sampling/sampling_logp_difference/max": 2.4286465644836426, "sampling/sampling_logp_difference/mean": 0.36127081513404846, "step": 481, "step_time": 30.495587893979973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8835265785455704, "epoch": 0.00482, "grad_norm": 0.07959035038948059, "kl": 0.5542021868750453, "learning_rate": 7.999926353693675e-06, "loss": -0.0977, "step": 482, "step_time": 15.243006117016193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.90625, "completions/mean_terminated_length": 5.736842155456543, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.251728504896164, "epoch": 0.00483, "frac_reward_zero_std": 0.0, "grad_norm": 0.05150742456316948, "kl": 0.248395511880517, "learning_rate": 7.999926023072302e-06, "loss": -0.0888, "num_tokens": 12610821.0, "reward": 0.2978900969028473, "reward_std": 1.3341960906982422, "rewards/rollout_reward_func/mean": 0.2978900969028473, "rewards/rollout_reward_func/std": 1.3341959714889526, "sampling/importance_sampling_ratio/max": 1.3598120212554932, "sampling/importance_sampling_ratio/mean": 0.5469174385070801, "sampling/importance_sampling_ratio/min": 7.869021828810219e-06, "sampling/sampling_logp_difference/max": 2.0731334686279297, "sampling/sampling_logp_difference/mean": 0.38325318694114685, "step": 483, "step_time": 29.640134576984565 }, { "clip_ratio/high_max": 0.004464285913854837, "clip_ratio/high_mean": 0.0022321429569274187, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022321429569274187, "entropy": 2.253010079264641, "epoch": 0.00484, "grad_norm": 0.05253363028168678, "kl": 0.2567704669199884, "learning_rate": 7.999925691710467e-06, "loss": -0.089, "step": 484, "step_time": 13.653356618990074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 5.5714287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1775395069271326, "epoch": 0.00485, "frac_reward_zero_std": 0.0, "grad_norm": 0.12895269691944122, "kl": 0.6135572176426649, "learning_rate": 7.99992535960817e-06, "loss": -0.0933, "num_tokens": 12659417.0, "reward": 1.0411546230316162, "reward_std": 1.1652318239212036, "rewards/rollout_reward_func/mean": 1.0411546230316162, "rewards/rollout_reward_func/std": 1.1652318239212036, "sampling/importance_sampling_ratio/max": 1.3161344528198242, "sampling/importance_sampling_ratio/mean": 0.8135663270950317, "sampling/importance_sampling_ratio/min": 0.00031735107768327, "sampling/sampling_logp_difference/max": 2.184359312057495, "sampling/sampling_logp_difference/mean": 0.2577192187309265, "step": 485, "step_time": 24.553168159996858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1713114362210035, "epoch": 0.00486, "grad_norm": 0.12718971073627472, "kl": 0.6241841204464436, "learning_rate": 7.999925026765412e-06, "loss": -0.0937, "step": 486, "step_time": 12.505953985004453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.40625, "completions/mean_terminated_length": 5.4347825050354, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0060219429433346, "epoch": 0.00487, "frac_reward_zero_std": 0.0, "grad_norm": 0.12861019372940063, "kl": 0.23739892337471247, "learning_rate": 7.999924693182194e-06, "loss": -0.1032, "num_tokens": 12715565.0, "reward": 0.4457208514213562, "reward_std": 1.2689976692199707, "rewards/rollout_reward_func/mean": 0.4457208514213562, "rewards/rollout_reward_func/std": 1.2689976692199707, "sampling/importance_sampling_ratio/max": 1.2884533405303955, "sampling/importance_sampling_ratio/mean": 0.6591289043426514, "sampling/importance_sampling_ratio/min": 1.1639489372328171e-07, "sampling/sampling_logp_difference/max": 1.9366438388824463, "sampling/sampling_logp_difference/mean": 0.39179617166519165, "step": 487, "step_time": 28.089350747002754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9952906128019094, "epoch": 0.00488, "grad_norm": 0.12474521994590759, "kl": 0.24197862530127168, "learning_rate": 7.999924358858514e-06, "loss": -0.1038, "step": 488, "step_time": 12.473699072026648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.375, "completions/mean_terminated_length": 5.833333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.032694712281227, "epoch": 0.00489, "frac_reward_zero_std": 0.0, "grad_norm": 0.16488730907440186, "kl": 0.7282243054360151, "learning_rate": 7.999924023794374e-06, "loss": -0.0913, "num_tokens": 12766391.0, "reward": 0.5721263885498047, "reward_std": 1.3351372480392456, "rewards/rollout_reward_func/mean": 0.5721263885498047, "rewards/rollout_reward_func/std": 1.3351372480392456, "sampling/importance_sampling_ratio/max": 1.3190399408340454, "sampling/importance_sampling_ratio/mean": 0.6052893400192261, "sampling/importance_sampling_ratio/min": 1.065874698724656e-06, "sampling/sampling_logp_difference/max": 2.1971707344055176, "sampling/sampling_logp_difference/mean": 0.4191686511039734, "step": 489, "step_time": 34.66247989796102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0281698368489742, "epoch": 0.0049, "grad_norm": 0.17724399268627167, "kl": 0.7729284781962633, "learning_rate": 7.999923687989774e-06, "loss": -0.0914, "step": 490, "step_time": 15.425521106008091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.03125, "completions/mean_terminated_length": 4.519999980926514, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2843827456235886, "epoch": 0.00491, "frac_reward_zero_std": 0.0, "grad_norm": 0.050778985023498535, "kl": 0.37447680346667767, "learning_rate": 7.999923351444713e-06, "loss": -0.0761, "num_tokens": 12814066.0, "reward": 0.5796388387680054, "reward_std": 1.1994818449020386, "rewards/rollout_reward_func/mean": 0.5796388387680054, "rewards/rollout_reward_func/std": 1.1994818449020386, "sampling/importance_sampling_ratio/max": 1.3633772134780884, "sampling/importance_sampling_ratio/mean": 0.8313030004501343, "sampling/importance_sampling_ratio/min": 1.2614314073289279e-06, "sampling/sampling_logp_difference/max": 2.115401029586792, "sampling/sampling_logp_difference/mean": 0.28944092988967896, "step": 491, "step_time": 25.47544471397123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.270990327000618, "epoch": 0.00492, "grad_norm": 0.0470062680542469, "kl": 0.39600549824535847, "learning_rate": 7.99992301415919e-06, "loss": -0.0762, "step": 492, "step_time": 12.349952970980667 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.96875, "completions/mean_terminated_length": 4.884615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2741910051554441, "epoch": 0.00493, "frac_reward_zero_std": 0.0, "grad_norm": 0.05769284814596176, "kl": 0.554922042414546, "learning_rate": 7.999922676133208e-06, "loss": -0.0788, "num_tokens": 12867204.0, "reward": 0.6986276507377625, "reward_std": 1.382422924041748, "rewards/rollout_reward_func/mean": 0.6986276507377625, "rewards/rollout_reward_func/std": 1.382422924041748, "sampling/importance_sampling_ratio/max": 1.3019375801086426, "sampling/importance_sampling_ratio/mean": 0.7209979295730591, "sampling/importance_sampling_ratio/min": 7.290871872100979e-05, "sampling/sampling_logp_difference/max": 1.9480690956115723, "sampling/sampling_logp_difference/mean": 0.28382647037506104, "step": 493, "step_time": 25.198642357005156 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 1.272998221218586, "epoch": 0.00494, "grad_norm": 0.059068456292152405, "kl": 0.513229688629508, "learning_rate": 7.999922337366765e-06, "loss": -0.0787, "step": 494, "step_time": 12.304023141026846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 4.769230842590332, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4488475695252419, "epoch": 0.00495, "frac_reward_zero_std": 0.0, "grad_norm": 0.23848417401313782, "kl": 0.3576636463403702, "learning_rate": 7.99992199785986e-06, "loss": -0.0614, "num_tokens": 12923759.0, "reward": 0.7461943030357361, "reward_std": 1.264309048652649, "rewards/rollout_reward_func/mean": 0.7461943030357361, "rewards/rollout_reward_func/std": 1.264309048652649, "sampling/importance_sampling_ratio/max": 1.617476224899292, "sampling/importance_sampling_ratio/mean": 0.842329740524292, "sampling/importance_sampling_ratio/min": 1.4258899909691536e-06, "sampling/sampling_logp_difference/max": 2.2658607959747314, "sampling/sampling_logp_difference/mean": 0.29965513944625854, "step": 495, "step_time": 27.046729757988942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.446828844025731, "epoch": 0.00496, "grad_norm": 0.25933006405830383, "kl": 0.347070325165987, "learning_rate": 7.999921657612498e-06, "loss": -0.0619, "step": 496, "step_time": 14.30678079399513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.518518447875977, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4812230970710516, "epoch": 0.00497, "frac_reward_zero_std": 0.5, "grad_norm": 0.03249356895685196, "kl": 0.40396819822490215, "learning_rate": 7.999921316624673e-06, "loss": -0.0448, "num_tokens": 12968132.0, "reward": 0.6092314124107361, "reward_std": 1.2157655954360962, "rewards/rollout_reward_func/mean": 0.6092314124107361, "rewards/rollout_reward_func/std": 1.2157655954360962, "sampling/importance_sampling_ratio/max": 1.1968348026275635, "sampling/importance_sampling_ratio/mean": 0.8010474443435669, "sampling/importance_sampling_ratio/min": 3.754843191927648e-06, "sampling/sampling_logp_difference/max": 1.9695611000061035, "sampling/sampling_logp_difference/mean": 0.33474382758140564, "step": 497, "step_time": 22.69042100101069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4789257552474737, "epoch": 0.00498, "grad_norm": 0.030614174902439117, "kl": 0.393295731395483, "learning_rate": 7.99992097489639e-06, "loss": -0.0448, "step": 498, "step_time": 11.803538213993306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.65625, "completions/mean_terminated_length": 5.782608985900879, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.279713496565819, "epoch": 0.00499, "frac_reward_zero_std": 0.0, "grad_norm": 0.05462343245744705, "kl": 0.21192754432559013, "learning_rate": 7.999920632427647e-06, "loss": -0.0926, "num_tokens": 13027359.0, "reward": 0.2976585924625397, "reward_std": 1.146488904953003, "rewards/rollout_reward_func/mean": 0.2976585924625397, "rewards/rollout_reward_func/std": 1.146488904953003, "sampling/importance_sampling_ratio/max": 1.4665032625198364, "sampling/importance_sampling_ratio/mean": 0.6464537978172302, "sampling/importance_sampling_ratio/min": 9.00653731150669e-07, "sampling/sampling_logp_difference/max": 1.899702548980713, "sampling/sampling_logp_difference/mean": 0.3898331820964813, "step": 499, "step_time": 27.80687095101166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.278457447886467, "epoch": 0.005, "grad_norm": 0.05247347801923752, "kl": 0.21047526504844427, "learning_rate": 7.999920289218444e-06, "loss": -0.0925, "step": 500, "step_time": 12.3608703520149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.3125, "completions/mean_terminated_length": 7.083333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.126915618777275, "epoch": 0.00501, "frac_reward_zero_std": 0.0, "grad_norm": 0.11372290551662445, "kl": 0.271783790551126, "learning_rate": 7.999919945268779e-06, "loss": -0.0672, "num_tokens": 13078027.0, "reward": 0.30260157585144043, "reward_std": 1.2091172933578491, "rewards/rollout_reward_func/mean": 0.30260157585144043, "rewards/rollout_reward_func/std": 1.2091172933578491, "sampling/importance_sampling_ratio/max": 1.5325733423233032, "sampling/importance_sampling_ratio/mean": 0.56316739320755, "sampling/importance_sampling_ratio/min": 2.127929747075541e-06, "sampling/sampling_logp_difference/max": 1.6777561902999878, "sampling/sampling_logp_difference/mean": 0.33679530024528503, "step": 501, "step_time": 32.65602565801237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1251509934663773, "epoch": 0.00502, "grad_norm": 0.11272155493497849, "kl": 0.2766137011349201, "learning_rate": 7.999919600578657e-06, "loss": -0.067, "step": 502, "step_time": 13.713290630024858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.28125, "completions/mean_terminated_length": 4.839999675750732, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7138884030282497, "epoch": 0.00503, "frac_reward_zero_std": 0.0, "grad_norm": 0.048362571746110916, "kl": 0.63407470472157, "learning_rate": 7.999919255148074e-06, "loss": -0.0876, "num_tokens": 13131407.0, "reward": 0.8803229928016663, "reward_std": 1.271622896194458, "rewards/rollout_reward_func/mean": 0.8803229928016663, "rewards/rollout_reward_func/std": 1.271622896194458, "sampling/importance_sampling_ratio/max": 1.583053469657898, "sampling/importance_sampling_ratio/mean": 0.7405819892883301, "sampling/importance_sampling_ratio/min": 1.0572599421720952e-05, "sampling/sampling_logp_difference/max": 1.8145599365234375, "sampling/sampling_logp_difference/mean": 0.34075796604156494, "step": 503, "step_time": 26.6994840749976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7123240120708942, "epoch": 0.00504, "grad_norm": 0.04971511289477348, "kl": 0.6494535375386477, "learning_rate": 7.999918908977031e-06, "loss": -0.0875, "step": 504, "step_time": 12.391793746995972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.53125, "completions/mean_terminated_length": 5.136363983154297, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9738022349774837, "epoch": 0.00505, "frac_reward_zero_std": 0.0, "grad_norm": 0.21687312424182892, "kl": 0.3019862286746502, "learning_rate": 7.999918562065531e-06, "loss": -0.0534, "num_tokens": 13189702.0, "reward": -0.19594983756542206, "reward_std": 1.0739535093307495, "rewards/rollout_reward_func/mean": -0.19594983756542206, "rewards/rollout_reward_func/std": 1.07395339012146, "sampling/importance_sampling_ratio/max": 1.7129062414169312, "sampling/importance_sampling_ratio/mean": 0.7285699844360352, "sampling/importance_sampling_ratio/min": 7.089738573995419e-08, "sampling/sampling_logp_difference/max": 2.6853837966918945, "sampling/sampling_logp_difference/mean": 0.35433822870254517, "step": 505, "step_time": 27.997403547997237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.975604336708784, "epoch": 0.00506, "grad_norm": 0.23462487757205963, "kl": 0.29978183889761567, "learning_rate": 7.999918214413569e-06, "loss": -0.0542, "step": 506, "step_time": 13.182584276990383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.125, "completions/mean_terminated_length": 5.523809432983398, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2386868447065353, "epoch": 0.00507, "frac_reward_zero_std": 0.0, "grad_norm": 0.458170086145401, "kl": 0.6899164011701941, "learning_rate": 7.999917866021148e-06, "loss": -0.0798, "num_tokens": 13247622.0, "reward": 0.4878377914428711, "reward_std": 1.3442586660385132, "rewards/rollout_reward_func/mean": 0.4878377914428711, "rewards/rollout_reward_func/std": 1.3442586660385132, "sampling/importance_sampling_ratio/max": 1.4832707643508911, "sampling/importance_sampling_ratio/mean": 0.547415554523468, "sampling/importance_sampling_ratio/min": 1.1196593732165638e-06, "sampling/sampling_logp_difference/max": 2.1062045097351074, "sampling/sampling_logp_difference/mean": 0.3613595962524414, "step": 507, "step_time": 28.439205455986666 }, { "clip_ratio/high_max": 0.028125000186264515, "clip_ratio/high_mean": 0.014062500093132257, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014062500093132257, "entropy": 2.2386010140180588, "epoch": 0.00508, "grad_norm": 0.12481795996427536, "kl": 0.6497786114923656, "learning_rate": 7.999917516888269e-06, "loss": -0.0818, "step": 508, "step_time": 12.65879612902063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.90625, "completions/mean_terminated_length": 4.360000133514404, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8145686462521553, "epoch": 0.00509, "frac_reward_zero_std": 0.25, "grad_norm": 0.06078348681330681, "kl": 0.31348327547311783, "learning_rate": 7.99991716701493e-06, "loss": -0.0459, "num_tokens": 13297050.0, "reward": -0.38612180948257446, "reward_std": 0.9544216990470886, "rewards/rollout_reward_func/mean": -0.38612180948257446, "rewards/rollout_reward_func/std": 0.9544216990470886, "sampling/importance_sampling_ratio/max": 1.258873701095581, "sampling/importance_sampling_ratio/mean": 0.6935657262802124, "sampling/importance_sampling_ratio/min": 1.5250496687713166e-07, "sampling/sampling_logp_difference/max": 2.0290367603302, "sampling/sampling_logp_difference/mean": 0.3493274450302124, "step": 509, "step_time": 27.965325644006953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8148570470511913, "epoch": 0.0051, "grad_norm": 0.05670241639018059, "kl": 0.3304338436573744, "learning_rate": 7.999916816401132e-06, "loss": -0.0459, "step": 510, "step_time": 13.36255251399416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 5.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0374589264392853, "epoch": 0.00511, "frac_reward_zero_std": 0.0, "grad_norm": 0.14635810256004333, "kl": 0.30204179137945175, "learning_rate": 7.999916465046877e-06, "loss": -0.0651, "num_tokens": 13350614.0, "reward": 0.21074441075325012, "reward_std": 1.274451494216919, "rewards/rollout_reward_func/mean": 0.21074441075325012, "rewards/rollout_reward_func/std": 1.274451494216919, "sampling/importance_sampling_ratio/max": 1.5072764158248901, "sampling/importance_sampling_ratio/mean": 0.5637044906616211, "sampling/importance_sampling_ratio/min": 1.2833426410452375e-07, "sampling/sampling_logp_difference/max": 2.0853123664855957, "sampling/sampling_logp_difference/mean": 0.4248085021972656, "step": 511, "step_time": 26.98152898401895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0346974059939384, "epoch": 0.00512, "grad_norm": 0.1470436453819275, "kl": 0.2909331419505179, "learning_rate": 7.99991611295216e-06, "loss": -0.0656, "step": 512, "step_time": 13.290334439006983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 4.6666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9459962025284767, "epoch": 0.00513, "frac_reward_zero_std": 0.0, "grad_norm": 0.04819868505001068, "kl": 0.6906452719122171, "learning_rate": 7.999915760116986e-06, "loss": -0.0917, "num_tokens": 13405376.0, "reward": 0.7877195477485657, "reward_std": 1.1354702711105347, "rewards/rollout_reward_func/mean": 0.7877195477485657, "rewards/rollout_reward_func/std": 1.1354702711105347, "sampling/importance_sampling_ratio/max": 1.4903709888458252, "sampling/importance_sampling_ratio/mean": 0.7932724356651306, "sampling/importance_sampling_ratio/min": 6.980970397307829e-07, "sampling/sampling_logp_difference/max": 2.4373745918273926, "sampling/sampling_logp_difference/mean": 0.3589327931404114, "step": 513, "step_time": 26.089028267990216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9476852789521217, "epoch": 0.00514, "grad_norm": 0.047729700803756714, "kl": 0.6757498588413, "learning_rate": 7.999915406541353e-06, "loss": -0.0919, "step": 514, "step_time": 11.781344043993158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.09375, "completions/mean_terminated_length": 4.259259223937988, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2575864847749472, "epoch": 0.00515, "frac_reward_zero_std": 0.0, "grad_norm": 0.21591390669345856, "kl": 0.5187007002532482, "learning_rate": 7.999915052225262e-06, "loss": -0.089, "num_tokens": 13458653.0, "reward": 0.8843910098075867, "reward_std": 1.1224170923233032, "rewards/rollout_reward_func/mean": 0.8843910098075867, "rewards/rollout_reward_func/std": 1.1224170923233032, "sampling/importance_sampling_ratio/max": 1.37363862991333, "sampling/importance_sampling_ratio/mean": 0.8449444770812988, "sampling/importance_sampling_ratio/min": 3.4363508802925935e-06, "sampling/sampling_logp_difference/max": 1.5563960075378418, "sampling/sampling_logp_difference/mean": 0.26642513275146484, "step": 515, "step_time": 26.40240085600817 }, { "clip_ratio/high_max": 0.057291666977107525, "clip_ratio/high_mean": 0.028645833488553762, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028645833488553762, "entropy": 1.2502977419644594, "epoch": 0.00516, "grad_norm": 0.13428613543510437, "kl": 0.5467937793582678, "learning_rate": 7.999914697168712e-06, "loss": -0.0902, "step": 516, "step_time": 12.505384066986153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 4.875, "completions/mean_terminated_length": 4.516129016876221, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5590096758678555, "epoch": 0.00517, "frac_reward_zero_std": 0.0, "grad_norm": 0.07953689247369766, "kl": 1.6224016100168228, "learning_rate": 7.999914341371702e-06, "loss": -0.0528, "num_tokens": 13498737.0, "reward": 1.6475183963775635, "reward_std": 0.7959722280502319, "rewards/rollout_reward_func/mean": 1.6475183963775635, "rewards/rollout_reward_func/std": 0.7959722280502319, "sampling/importance_sampling_ratio/max": 1.208069086074829, "sampling/importance_sampling_ratio/mean": 0.9284247159957886, "sampling/importance_sampling_ratio/min": 0.00047076554619707167, "sampling/sampling_logp_difference/max": 1.8456213474273682, "sampling/sampling_logp_difference/mean": 0.1603870689868927, "step": 517, "step_time": 18.09216668701265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5600048005580902, "epoch": 0.00518, "grad_norm": 0.07049862295389175, "kl": 1.4965593554079533, "learning_rate": 7.999913984834237e-06, "loss": -0.0532, "step": 518, "step_time": 9.795149538986152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.091541824862361, "epoch": 0.00519, "frac_reward_zero_std": 0.0, "grad_norm": 0.0708044022321701, "kl": 0.2384609915316105, "learning_rate": 7.99991362755631e-06, "loss": -0.0544, "num_tokens": 13552012.0, "reward": 0.5535950064659119, "reward_std": 1.334583044052124, "rewards/rollout_reward_func/mean": 0.5535950064659119, "rewards/rollout_reward_func/std": 1.334583044052124, "sampling/importance_sampling_ratio/max": 1.8840839862823486, "sampling/importance_sampling_ratio/mean": 0.9654867053031921, "sampling/importance_sampling_ratio/min": 1.1410379556764383e-05, "sampling/sampling_logp_difference/max": 2.182556390762329, "sampling/sampling_logp_difference/mean": 0.2360442876815796, "step": 519, "step_time": 27.060592754991376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0904323067516088, "epoch": 0.0052, "grad_norm": 0.06356062740087509, "kl": 0.24092890694737434, "learning_rate": 7.999913269537927e-06, "loss": -0.0542, "step": 520, "step_time": 13.100917900024797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.8275861740112305, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3275527488440275, "epoch": 0.00521, "frac_reward_zero_std": 0.0, "grad_norm": 0.13546358048915863, "kl": 0.41946721263229847, "learning_rate": 7.999912910779086e-06, "loss": -0.0814, "num_tokens": 13605691.0, "reward": -0.012918442487716675, "reward_std": 1.1511839628219604, "rewards/rollout_reward_func/mean": -0.012918442487716675, "rewards/rollout_reward_func/std": 1.1511839628219604, "sampling/importance_sampling_ratio/max": 1.558212399482727, "sampling/importance_sampling_ratio/mean": 0.8385034799575806, "sampling/importance_sampling_ratio/min": 8.15228531791945e-07, "sampling/sampling_logp_difference/max": 2.855161190032959, "sampling/sampling_logp_difference/mean": 0.3260275721549988, "step": 521, "step_time": 25.131463205980253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.335614113137126, "epoch": 0.00522, "grad_norm": 0.08918300271034241, "kl": 0.44205143488943577, "learning_rate": 7.999912551279787e-06, "loss": -0.0816, "step": 522, "step_time": 13.98780513102247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.15625, "completions/mean_terminated_length": 4.679999828338623, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7735847923904657, "epoch": 0.00523, "frac_reward_zero_std": 0.25, "grad_norm": 0.05344495549798012, "kl": 0.49513990432024, "learning_rate": 7.99991219104003e-06, "loss": -0.0776, "num_tokens": 13658865.0, "reward": 0.8122799396514893, "reward_std": 1.2557971477508545, "rewards/rollout_reward_func/mean": 0.8122799396514893, "rewards/rollout_reward_func/std": 1.2557971477508545, "sampling/importance_sampling_ratio/max": 1.4558049440383911, "sampling/importance_sampling_ratio/mean": 0.7928494215011597, "sampling/importance_sampling_ratio/min": 8.967291798001042e-09, "sampling/sampling_logp_difference/max": 2.2766261100769043, "sampling/sampling_logp_difference/mean": 0.39036867022514343, "step": 523, "step_time": 25.54944021400297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7768719270825386, "epoch": 0.00524, "grad_norm": 0.05280565470457077, "kl": 0.4834983628243208, "learning_rate": 7.999911830059816e-06, "loss": -0.0777, "step": 524, "step_time": 12.12530502100708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.46875, "completions/mean_terminated_length": 4.379310131072998, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7791079059243202, "epoch": 0.00525, "frac_reward_zero_std": 0.25, "grad_norm": 0.03392122685909271, "kl": 0.32610384933650494, "learning_rate": 7.999911468339143e-06, "loss": -0.0517, "num_tokens": 13701378.0, "reward": 0.9808173775672913, "reward_std": 1.3516491651535034, "rewards/rollout_reward_func/mean": 0.9808173775672913, "rewards/rollout_reward_func/std": 1.3516491651535034, "sampling/importance_sampling_ratio/max": 1.1778596639633179, "sampling/importance_sampling_ratio/mean": 0.922625720500946, "sampling/importance_sampling_ratio/min": 0.000922827166505158, "sampling/sampling_logp_difference/max": 1.409964919090271, "sampling/sampling_logp_difference/mean": 0.15635472536087036, "step": 525, "step_time": 23.497074821032584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7800920139998198, "epoch": 0.00526, "grad_norm": 0.039171911776065826, "kl": 0.3327486217021942, "learning_rate": 7.999911105878013e-06, "loss": -0.0518, "step": 526, "step_time": 13.123302143008914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.625, "completions/mean_terminated_length": 4.258064270019531, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6528336424380541, "epoch": 0.00527, "frac_reward_zero_std": 0.5, "grad_norm": 0.0883951261639595, "kl": 0.8828487992286682, "learning_rate": 7.999910742676423e-06, "loss": -0.0119, "num_tokens": 13739020.0, "reward": 0.820752739906311, "reward_std": 1.376532793045044, "rewards/rollout_reward_func/mean": 0.820752739906311, "rewards/rollout_reward_func/std": 1.3765326738357544, "sampling/importance_sampling_ratio/max": 1.2226073741912842, "sampling/importance_sampling_ratio/mean": 0.9687589406967163, "sampling/importance_sampling_ratio/min": 7.569635272375308e-06, "sampling/sampling_logp_difference/max": 1.7662955522537231, "sampling/sampling_logp_difference/mean": 0.17394843697547913, "step": 527, "step_time": 19.85372523998376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6580741284415126, "epoch": 0.00528, "grad_norm": 0.095266193151474, "kl": 0.86391456797719, "learning_rate": 7.999910378734379e-06, "loss": -0.0122, "step": 528, "step_time": 11.222112230985658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.53125, "completions/mean_terminated_length": 5.159999847412109, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.571523554623127, "epoch": 0.00529, "frac_reward_zero_std": 0.0, "grad_norm": 0.1786564588546753, "kl": 1.0557934828102589, "learning_rate": 7.999910014051875e-06, "loss": -0.0523, "num_tokens": 13799973.0, "reward": 0.0832723081111908, "reward_std": 1.15095055103302, "rewards/rollout_reward_func/mean": 0.0832723081111908, "rewards/rollout_reward_func/std": 1.1509504318237305, "sampling/importance_sampling_ratio/max": 1.4073041677474976, "sampling/importance_sampling_ratio/mean": 0.6691641211509705, "sampling/importance_sampling_ratio/min": 1.4376361832546536e-05, "sampling/sampling_logp_difference/max": 2.5616695880889893, "sampling/sampling_logp_difference/mean": 0.3258548676967621, "step": 529, "step_time": 27.525749961991096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.5822682427242398, "epoch": 0.0053, "grad_norm": 0.12860827147960663, "kl": 0.968542717397213, "learning_rate": 7.999909648628916e-06, "loss": -0.0533, "step": 530, "step_time": 13.471733401995152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 4.692307949066162, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2907453896477818, "epoch": 0.00531, "frac_reward_zero_std": 0.0, "grad_norm": 0.04844352975487709, "kl": 0.4724350571632385, "learning_rate": 7.999909282465499e-06, "loss": -0.0832, "num_tokens": 13851560.0, "reward": 1.3726537227630615, "reward_std": 1.110917568206787, "rewards/rollout_reward_func/mean": 1.3726537227630615, "rewards/rollout_reward_func/std": 1.1109174489974976, "sampling/importance_sampling_ratio/max": 1.5036282539367676, "sampling/importance_sampling_ratio/mean": 0.8148916959762573, "sampling/importance_sampling_ratio/min": 1.464952532614916e-07, "sampling/sampling_logp_difference/max": 2.2389519214630127, "sampling/sampling_logp_difference/mean": 0.2802264094352722, "step": 531, "step_time": 25.102309665002394 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "entropy": 1.3063922161236405, "epoch": 0.00532, "grad_norm": 0.05314183607697487, "kl": 0.4234025436453521, "learning_rate": 7.999908915561626e-06, "loss": -0.0831, "step": 532, "step_time": 12.16363751697645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.076923370361328, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.089488472789526, "epoch": 0.00533, "frac_reward_zero_std": 0.0, "grad_norm": 0.07098682224750519, "kl": 0.39512544497847557, "learning_rate": 7.999908547917295e-06, "loss": -0.0682, "num_tokens": 13908664.0, "reward": 0.8651026487350464, "reward_std": 1.231921672821045, "rewards/rollout_reward_func/mean": 0.8651026487350464, "rewards/rollout_reward_func/std": 1.231921672821045, "sampling/importance_sampling_ratio/max": 1.4971823692321777, "sampling/importance_sampling_ratio/mean": 0.8744548559188843, "sampling/importance_sampling_ratio/min": 0.00011857796926051378, "sampling/sampling_logp_difference/max": 1.5741380453109741, "sampling/sampling_logp_difference/mean": 0.21740931272506714, "step": 533, "step_time": 32.090021876007086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.098498860374093, "epoch": 0.00534, "grad_norm": 0.0690801739692688, "kl": 0.4012566125020385, "learning_rate": 7.999908179532507e-06, "loss": -0.0684, "step": 534, "step_time": 15.386213359000976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.90625, "completions/mean_terminated_length": 5.222222328186035, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.528145533055067, "epoch": 0.00535, "frac_reward_zero_std": 0.0, "grad_norm": 0.06820424646139145, "kl": 0.3922602403908968, "learning_rate": 7.999907810407261e-06, "loss": -0.058, "num_tokens": 13969063.0, "reward": 0.4432871639728546, "reward_std": 1.2999380826950073, "rewards/rollout_reward_func/mean": 0.4432871639728546, "rewards/rollout_reward_func/std": 1.2999380826950073, "sampling/importance_sampling_ratio/max": 1.469085931777954, "sampling/importance_sampling_ratio/mean": 0.7584387063980103, "sampling/importance_sampling_ratio/min": 4.685374278778909e-06, "sampling/sampling_logp_difference/max": 2.2117881774902344, "sampling/sampling_logp_difference/mean": 0.3192785978317261, "step": 535, "step_time": 29.610586626979057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5387283191084862, "epoch": 0.00536, "grad_norm": 0.07581272721290588, "kl": 0.3957766145467758, "learning_rate": 7.999907440541558e-06, "loss": -0.0584, "step": 536, "step_time": 14.73171812700457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.78125, "completions/mean_terminated_length": 4.724137783050537, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9077542554587126, "epoch": 0.00537, "frac_reward_zero_std": 0.25, "grad_norm": 0.05894440785050392, "kl": 0.3880137410014868, "learning_rate": 7.999907069935401e-06, "loss": -0.0506, "num_tokens": 14009199.0, "reward": 1.198388695716858, "reward_std": 1.1845428943634033, "rewards/rollout_reward_func/mean": 1.198388695716858, "rewards/rollout_reward_func/std": 1.1845428943634033, "sampling/importance_sampling_ratio/max": 1.1784372329711914, "sampling/importance_sampling_ratio/mean": 0.8468233942985535, "sampling/importance_sampling_ratio/min": 0.0005743214278481901, "sampling/sampling_logp_difference/max": 1.7752859592437744, "sampling/sampling_logp_difference/mean": 0.1705302894115448, "step": 537, "step_time": 23.078938259001006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9156440161168575, "epoch": 0.00538, "grad_norm": 0.05291516333818436, "kl": 0.401084428653121, "learning_rate": 7.999906698588786e-06, "loss": -0.0508, "step": 538, "step_time": 11.441615717980312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.6875, "completions/mean_terminated_length": 4.9166669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8529903180897236, "epoch": 0.00539, "frac_reward_zero_std": 0.0, "grad_norm": 0.11693263053894043, "kl": 0.5407540639862418, "learning_rate": 7.999906326501715e-06, "loss": -0.0754, "num_tokens": 14058633.0, "reward": -0.2666946053504944, "reward_std": 1.1071521043777466, "rewards/rollout_reward_func/mean": -0.2666946053504944, "rewards/rollout_reward_func/std": 1.1071521043777466, "sampling/importance_sampling_ratio/max": 1.2260395288467407, "sampling/importance_sampling_ratio/mean": 0.5556653141975403, "sampling/importance_sampling_ratio/min": 7.64836727285001e-08, "sampling/sampling_logp_difference/max": 2.5311968326568604, "sampling/sampling_logp_difference/mean": 0.39236003160476685, "step": 539, "step_time": 23.902346732997103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.8631506636738777, "epoch": 0.0054, "grad_norm": 0.11653143167495728, "kl": 0.538751631975174, "learning_rate": 7.999905953674187e-06, "loss": -0.0763, "step": 540, "step_time": 12.116717770986725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.71875, "completions/mean_terminated_length": 4.119999885559082, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5588705763220787, "epoch": 0.00541, "frac_reward_zero_std": 0.0, "grad_norm": 0.05365607514977455, "kl": 0.8971416559070349, "learning_rate": 7.999905580106204e-06, "loss": -0.0902, "num_tokens": 14108075.0, "reward": 1.006788969039917, "reward_std": 1.2224658727645874, "rewards/rollout_reward_func/mean": 1.006788969039917, "rewards/rollout_reward_func/std": 1.2224658727645874, "sampling/importance_sampling_ratio/max": 1.364775538444519, "sampling/importance_sampling_ratio/mean": 0.7763394117355347, "sampling/importance_sampling_ratio/min": 3.046613272772447e-08, "sampling/sampling_logp_difference/max": 2.473729133605957, "sampling/sampling_logp_difference/mean": 0.3326091766357422, "step": 541, "step_time": 28.25884466797288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5713827162981033, "epoch": 0.00542, "grad_norm": 0.057236168533563614, "kl": 0.8614887539297342, "learning_rate": 7.999905205797764e-06, "loss": -0.0902, "step": 542, "step_time": 14.203117736979038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.375, "completions/mean_terminated_length": 5.833333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0842599757015705, "epoch": 0.00543, "frac_reward_zero_std": 0.0, "grad_norm": 0.1047394722700119, "kl": 0.6407509092241526, "learning_rate": 7.999904830748868e-06, "loss": -0.0892, "num_tokens": 14159466.0, "reward": 0.3060743510723114, "reward_std": 1.439557671546936, "rewards/rollout_reward_func/mean": 0.3060743510723114, "rewards/rollout_reward_func/std": 1.439557671546936, "sampling/importance_sampling_ratio/max": 1.279478907585144, "sampling/importance_sampling_ratio/mean": 0.6491549015045166, "sampling/importance_sampling_ratio/min": 2.2181004055710218e-07, "sampling/sampling_logp_difference/max": 2.069347858428955, "sampling/sampling_logp_difference/mean": 0.430356502532959, "step": 543, "step_time": 24.882554883006378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.083754900842905, "epoch": 0.00544, "grad_norm": 0.10462454706430435, "kl": 0.6335817286744714, "learning_rate": 7.999904454959516e-06, "loss": -0.0894, "step": 544, "step_time": 12.61686081699736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 4.869565486907959, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9578859880566597, "epoch": 0.00545, "frac_reward_zero_std": 0.0, "grad_norm": 0.13088107109069824, "kl": 0.3274509748443961, "learning_rate": 7.999904078429708e-06, "loss": -0.0923, "num_tokens": 14216249.0, "reward": 0.310925155878067, "reward_std": 1.343994379043579, "rewards/rollout_reward_func/mean": 0.310925155878067, "rewards/rollout_reward_func/std": 1.343994379043579, "sampling/importance_sampling_ratio/max": 1.654751181602478, "sampling/importance_sampling_ratio/mean": 0.5909398198127747, "sampling/importance_sampling_ratio/min": 0.0009195240563713014, "sampling/sampling_logp_difference/max": 1.6762949228286743, "sampling/sampling_logp_difference/mean": 0.31216055154800415, "step": 545, "step_time": 30.065374325000448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9569661058485508, "epoch": 0.00546, "grad_norm": 0.12800616025924683, "kl": 0.3299504481256008, "learning_rate": 7.999903701159445e-06, "loss": -0.0923, "step": 546, "step_time": 13.709010744962143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.375, "completions/mean_terminated_length": 5.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.554632715880871, "epoch": 0.00547, "frac_reward_zero_std": 0.0, "grad_norm": 0.13283571600914001, "kl": 0.23123170528560877, "learning_rate": 7.999903323148725e-06, "loss": -0.0399, "num_tokens": 14277055.0, "reward": 0.21908274292945862, "reward_std": 1.1873810291290283, "rewards/rollout_reward_func/mean": 0.21908274292945862, "rewards/rollout_reward_func/std": 1.1873810291290283, "sampling/importance_sampling_ratio/max": 1.3714135885238647, "sampling/importance_sampling_ratio/mean": 0.5272529125213623, "sampling/importance_sampling_ratio/min": 7.547116354089667e-08, "sampling/sampling_logp_difference/max": 2.3456544876098633, "sampling/sampling_logp_difference/mean": 0.41264715790748596, "step": 547, "step_time": 31.730464984007995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.54608004540205, "epoch": 0.00548, "grad_norm": 0.11132454127073288, "kl": 0.23458316829055548, "learning_rate": 7.99990294439755e-06, "loss": -0.0406, "step": 548, "step_time": 13.638669111955096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.09375, "completions/mean_terminated_length": 4.882352828979492, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.504417985677719, "epoch": 0.00549, "frac_reward_zero_std": 0.0, "grad_norm": 0.07618296891450882, "kl": 0.5557089829817414, "learning_rate": 7.999902564905919e-06, "loss": -0.1052, "num_tokens": 14337688.0, "reward": 0.3997008204460144, "reward_std": 1.3220423460006714, "rewards/rollout_reward_func/mean": 0.3997008204460144, "rewards/rollout_reward_func/std": 1.3220423460006714, "sampling/importance_sampling_ratio/max": 1.147964596748352, "sampling/importance_sampling_ratio/mean": 0.46134838461875916, "sampling/importance_sampling_ratio/min": 2.885311900513443e-08, "sampling/sampling_logp_difference/max": 2.1173248291015625, "sampling/sampling_logp_difference/mean": 0.4156936705112457, "step": 549, "step_time": 35.24988787996699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4974619299173355, "epoch": 0.0055, "grad_norm": 0.05679529160261154, "kl": 0.5098897013813257, "learning_rate": 7.999902184673833e-06, "loss": -0.1056, "step": 550, "step_time": 16.618200734010316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.71875, "completions/mean_terminated_length": 4.576923370361328, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.459775960072875, "epoch": 0.00551, "frac_reward_zero_std": 0.0, "grad_norm": 0.07013733685016632, "kl": 0.47191909700632095, "learning_rate": 7.999901803701292e-06, "loss": -0.0657, "num_tokens": 14386734.0, "reward": 0.9964228868484497, "reward_std": 1.2851808071136475, "rewards/rollout_reward_func/mean": 0.9964228868484497, "rewards/rollout_reward_func/std": 1.285180687904358, "sampling/importance_sampling_ratio/max": 1.5262681245803833, "sampling/importance_sampling_ratio/mean": 0.8502715826034546, "sampling/importance_sampling_ratio/min": 5.567665795069843e-08, "sampling/sampling_logp_difference/max": 2.1666488647460938, "sampling/sampling_logp_difference/mean": 0.3899899125099182, "step": 551, "step_time": 28.447695411989116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4475512634962797, "epoch": 0.00552, "grad_norm": 0.06332331150770187, "kl": 0.46446930803358555, "learning_rate": 7.999901421988296e-06, "loss": -0.0661, "step": 552, "step_time": 13.832695529985358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.78125, "completions/mean_terminated_length": 5.0740742683410645, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.803645133972168, "epoch": 0.00553, "frac_reward_zero_std": 0.0, "grad_norm": 0.08737089484930038, "kl": 0.4802823029458523, "learning_rate": 7.999901039534843e-06, "loss": -0.0902, "num_tokens": 14439236.0, "reward": 0.3758736848831177, "reward_std": 1.1579941511154175, "rewards/rollout_reward_func/mean": 0.3758736848831177, "rewards/rollout_reward_func/std": 1.157994031906128, "sampling/importance_sampling_ratio/max": 1.4539448022842407, "sampling/importance_sampling_ratio/mean": 0.6752393841743469, "sampling/importance_sampling_ratio/min": 1.1583839523154893e-06, "sampling/sampling_logp_difference/max": 2.0955276489257812, "sampling/sampling_logp_difference/mean": 0.35518524050712585, "step": 553, "step_time": 25.875718006005627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.79304962977767, "epoch": 0.00554, "grad_norm": 0.08794471621513367, "kl": 0.4919149950146675, "learning_rate": 7.999900656340938e-06, "loss": -0.0904, "step": 554, "step_time": 12.188871824982925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.53125, "completions/mean_terminated_length": 4.708333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8000672608613968, "epoch": 0.00555, "frac_reward_zero_std": 0.0, "grad_norm": 0.10209718346595764, "kl": 0.28988262452185154, "learning_rate": 7.999900272406576e-06, "loss": -0.1056, "num_tokens": 14497526.0, "reward": 0.4795461893081665, "reward_std": 1.1712740659713745, "rewards/rollout_reward_func/mean": 0.4795461893081665, "rewards/rollout_reward_func/std": 1.1712740659713745, "sampling/importance_sampling_ratio/max": 1.3625481128692627, "sampling/importance_sampling_ratio/mean": 0.7220153212547302, "sampling/importance_sampling_ratio/min": 7.252477189467754e-06, "sampling/sampling_logp_difference/max": 2.012813091278076, "sampling/sampling_logp_difference/mean": 0.3739808201789856, "step": 555, "step_time": 31.594650831000763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.790405172854662, "epoch": 0.00556, "grad_norm": 0.10290175676345825, "kl": 0.3019763585180044, "learning_rate": 7.999899887731757e-06, "loss": -0.1058, "step": 556, "step_time": 16.31113869001274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 5.000000476837158, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2180471122264862, "epoch": 0.00557, "frac_reward_zero_std": 0.0, "grad_norm": 0.19167473912239075, "kl": 0.6353391967713833, "learning_rate": 7.999899502316487e-06, "loss": -0.053, "num_tokens": 14552899.0, "reward": 0.4644234776496887, "reward_std": 1.2601102590560913, "rewards/rollout_reward_func/mean": 0.4644234776496887, "rewards/rollout_reward_func/std": 1.2601101398468018, "sampling/importance_sampling_ratio/max": 1.343796730041504, "sampling/importance_sampling_ratio/mean": 0.8284797668457031, "sampling/importance_sampling_ratio/min": 1.0548779982855194e-07, "sampling/sampling_logp_difference/max": 2.6107077598571777, "sampling/sampling_logp_difference/mean": 0.3155635893344879, "step": 557, "step_time": 30.58596149100049 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.206134393811226, "epoch": 0.00558, "grad_norm": 0.10973597317934036, "kl": 0.6585676558315754, "learning_rate": 7.999899116160762e-06, "loss": -0.0531, "step": 558, "step_time": 16.309756532980828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.03125, "completions/mean_terminated_length": 4.91304349899292, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4534912016242743, "epoch": 0.00559, "frac_reward_zero_std": 0.0, "grad_norm": 0.15209263563156128, "kl": 0.4374007824808359, "learning_rate": 7.99989872926458e-06, "loss": -0.0651, "num_tokens": 14605977.0, "reward": 0.08402694761753082, "reward_std": 1.1920580863952637, "rewards/rollout_reward_func/mean": 0.08402694761753082, "rewards/rollout_reward_func/std": 1.1920579671859741, "sampling/importance_sampling_ratio/max": 1.8372588157653809, "sampling/importance_sampling_ratio/mean": 0.7227039337158203, "sampling/importance_sampling_ratio/min": 6.93762842729484e-08, "sampling/sampling_logp_difference/max": 2.426513433456421, "sampling/sampling_logp_difference/mean": 0.4779719114303589, "step": 559, "step_time": 30.357461091000005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4482246497645974, "epoch": 0.0056, "grad_norm": 0.1600572168827057, "kl": 0.42397612519562244, "learning_rate": 7.999898341627945e-06, "loss": -0.0653, "step": 560, "step_time": 14.372852728003636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.5625, "completions/mean_terminated_length": 5.18181848526001, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.941722328774631, "epoch": 0.00561, "frac_reward_zero_std": 0.0, "grad_norm": 0.0570194348692894, "kl": 0.5103258201852441, "learning_rate": 7.999897953250855e-06, "loss": -0.0899, "num_tokens": 14663484.0, "reward": 0.4671339988708496, "reward_std": 1.1971153020858765, "rewards/rollout_reward_func/mean": 0.4671339988708496, "rewards/rollout_reward_func/std": 1.1971153020858765, "sampling/importance_sampling_ratio/max": 1.243234634399414, "sampling/importance_sampling_ratio/mean": 0.6707671880722046, "sampling/importance_sampling_ratio/min": 0.0001571256434544921, "sampling/sampling_logp_difference/max": 1.5693891048431396, "sampling/sampling_logp_difference/mean": 0.32729649543762207, "step": 561, "step_time": 30.81058567999571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.943136223591864, "epoch": 0.00562, "grad_norm": 0.05379649996757507, "kl": 0.49178182799369097, "learning_rate": 7.999897564133312e-06, "loss": -0.09, "step": 562, "step_time": 14.129486996986088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.59375, "completions/mean_terminated_length": 5.695652484893799, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.283021092414856, "epoch": 0.00563, "frac_reward_zero_std": 0.0, "grad_norm": 0.055693984031677246, "kl": 0.2515510912053287, "learning_rate": 7.999897174275314e-06, "loss": -0.0804, "num_tokens": 14720833.0, "reward": 0.38537171483039856, "reward_std": 1.2655560970306396, "rewards/rollout_reward_func/mean": 0.38537171483039856, "rewards/rollout_reward_func/std": 1.2655560970306396, "sampling/importance_sampling_ratio/max": 1.4499306678771973, "sampling/importance_sampling_ratio/mean": 0.6368168592453003, "sampling/importance_sampling_ratio/min": 8.64985031512333e-06, "sampling/sampling_logp_difference/max": 1.7577106952667236, "sampling/sampling_logp_difference/mean": 0.38865363597869873, "step": 563, "step_time": 31.222166579012992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.292959675192833, "epoch": 0.00564, "grad_norm": 0.05804044008255005, "kl": 0.25944541161879897, "learning_rate": 7.999896783676862e-06, "loss": -0.0803, "step": 564, "step_time": 15.093621200008783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.8125, "completions/mean_terminated_length": 5.047619342803955, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0347879268229008, "epoch": 0.00565, "frac_reward_zero_std": 0.0, "grad_norm": 0.031925737857818604, "kl": 0.38168207462877035, "learning_rate": 7.999896392337957e-06, "loss": -0.0737, "num_tokens": 14768955.0, "reward": 0.6827396154403687, "reward_std": 1.3878133296966553, "rewards/rollout_reward_func/mean": 0.6827396154403687, "rewards/rollout_reward_func/std": 1.3878133296966553, "sampling/importance_sampling_ratio/max": 1.5779392719268799, "sampling/importance_sampling_ratio/mean": 0.6249752044677734, "sampling/importance_sampling_ratio/min": 7.1975068749452475e-06, "sampling/sampling_logp_difference/max": 2.486691474914551, "sampling/sampling_logp_difference/mean": 0.40662193298339844, "step": 565, "step_time": 26.55692573300621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0409297859296203, "epoch": 0.00566, "grad_norm": 0.03048982098698616, "kl": 0.35940985661000013, "learning_rate": 7.999896000258596e-06, "loss": -0.0736, "step": 566, "step_time": 13.094898448005551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.1875, "completions/mean_terminated_length": 6.090909481048584, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3950178772211075, "epoch": 0.00567, "frac_reward_zero_std": 0.0, "grad_norm": 0.10839156061410904, "kl": 0.2783849257975817, "learning_rate": 7.999895607438783e-06, "loss": -0.0557, "num_tokens": 14831904.0, "reward": -0.04960349202156067, "reward_std": 1.0559018850326538, "rewards/rollout_reward_func/mean": -0.04960349202156067, "rewards/rollout_reward_func/std": 1.0559018850326538, "sampling/importance_sampling_ratio/max": 1.538567304611206, "sampling/importance_sampling_ratio/mean": 0.5607859492301941, "sampling/importance_sampling_ratio/min": 9.380144661008671e-09, "sampling/sampling_logp_difference/max": 2.4044861793518066, "sampling/sampling_logp_difference/mean": 0.4148266315460205, "step": 567, "step_time": 30.77636935201008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.393392950296402, "epoch": 0.00568, "grad_norm": 0.10869723558425903, "kl": 0.2768871337175369, "learning_rate": 7.999895213878515e-06, "loss": -0.0559, "step": 568, "step_time": 13.871767750009894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 4.6666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.32887252420187, "epoch": 0.00569, "frac_reward_zero_std": 0.0, "grad_norm": 0.10320425778627396, "kl": 0.34474798664450645, "learning_rate": 7.999894819577795e-06, "loss": -0.0662, "num_tokens": 14892086.0, "reward": 0.46916550397872925, "reward_std": 1.3027154207229614, "rewards/rollout_reward_func/mean": 0.46916550397872925, "rewards/rollout_reward_func/std": 1.3027154207229614, "sampling/importance_sampling_ratio/max": 1.471123218536377, "sampling/importance_sampling_ratio/mean": 0.8412514328956604, "sampling/importance_sampling_ratio/min": 7.906380778877065e-05, "sampling/sampling_logp_difference/max": 1.6652185916900635, "sampling/sampling_logp_difference/mean": 0.26583802700042725, "step": 569, "step_time": 29.34839489700971 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 1.3396892622113228, "epoch": 0.0057, "grad_norm": 0.10881786793470383, "kl": 0.3344489000737667, "learning_rate": 7.99989442453662e-06, "loss": -0.0664, "step": 570, "step_time": 14.51363709999714 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.90625, "completions/mean_terminated_length": 5.68181848526001, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.795258589088917, "epoch": 0.00571, "frac_reward_zero_std": 0.0, "grad_norm": 0.08358745276927948, "kl": 0.46301653049886227, "learning_rate": 7.999894028754991e-06, "loss": -0.0498, "num_tokens": 14948529.0, "reward": -0.10663319379091263, "reward_std": 1.067921757698059, "rewards/rollout_reward_func/mean": -0.10663319379091263, "rewards/rollout_reward_func/std": 1.067921757698059, "sampling/importance_sampling_ratio/max": 1.4639583826065063, "sampling/importance_sampling_ratio/mean": 0.44453179836273193, "sampling/importance_sampling_ratio/min": 1.69701752383844e-06, "sampling/sampling_logp_difference/max": 1.7409350872039795, "sampling/sampling_logp_difference/mean": 0.48495733737945557, "step": 571, "step_time": 27.249330981983803 }, { "clip_ratio/high_max": 0.028977273497730494, "clip_ratio/high_mean": 0.014488636748865247, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014488636748865247, "entropy": 2.8005161359906197, "epoch": 0.00572, "grad_norm": 0.08368400484323502, "kl": 0.43213894683867693, "learning_rate": 7.99989363223291e-06, "loss": -0.0503, "step": 572, "step_time": 12.550388157978887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.78125, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7973697632551193, "epoch": 0.00573, "frac_reward_zero_std": 0.0, "grad_norm": 0.1338677704334259, "kl": 0.4118153234012425, "learning_rate": 7.999893234970375e-06, "loss": -0.037, "num_tokens": 15000498.0, "reward": 0.058193355798721313, "reward_std": 1.0862879753112793, "rewards/rollout_reward_func/mean": 0.058193355798721313, "rewards/rollout_reward_func/std": 1.0862879753112793, "sampling/importance_sampling_ratio/max": 1.5863133668899536, "sampling/importance_sampling_ratio/mean": 0.6582804918289185, "sampling/importance_sampling_ratio/min": 1.4413366216103896e-06, "sampling/sampling_logp_difference/max": 1.8859485387802124, "sampling/sampling_logp_difference/mean": 0.31306713819503784, "step": 573, "step_time": 34.147192110991455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8032907880842686, "epoch": 0.00574, "grad_norm": 0.13670037686824799, "kl": 0.39275691378861666, "learning_rate": 7.999892836967388e-06, "loss": -0.0376, "step": 574, "step_time": 14.183835360992816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.34375, "completions/mean_terminated_length": 4.789473533630371, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4788311310112476, "epoch": 0.00575, "frac_reward_zero_std": 0.0, "grad_norm": 0.16129741072654724, "kl": 0.11395248584449291, "learning_rate": 7.999892438223946e-06, "loss": -0.0441, "num_tokens": 15061111.0, "reward": -0.4035740792751312, "reward_std": 0.7623164057731628, "rewards/rollout_reward_func/mean": -0.4035740792751312, "rewards/rollout_reward_func/std": 0.7623164057731628, "sampling/importance_sampling_ratio/max": 1.4709399938583374, "sampling/importance_sampling_ratio/mean": 0.5931105613708496, "sampling/importance_sampling_ratio/min": 1.1889574835777239e-07, "sampling/sampling_logp_difference/max": 2.1272661685943604, "sampling/sampling_logp_difference/mean": 0.4220258593559265, "step": 575, "step_time": 30.863414087012643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 2.4917709305882454, "epoch": 0.00576, "grad_norm": 0.12407448887825012, "kl": 0.10897193802520633, "learning_rate": 7.999892038740054e-06, "loss": -0.0449, "step": 576, "step_time": 13.59342975000618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.46875, "completions/mean_terminated_length": 4.625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7599961459636688, "epoch": 0.00577, "frac_reward_zero_std": 0.0, "grad_norm": 0.12039511650800705, "kl": 0.2813537325710058, "learning_rate": 7.999891638515707e-06, "loss": -0.1028, "num_tokens": 15118474.0, "reward": 0.4889097511768341, "reward_std": 1.3531434535980225, "rewards/rollout_reward_func/mean": 0.4889097511768341, "rewards/rollout_reward_func/std": 1.353143334388733, "sampling/importance_sampling_ratio/max": 1.4161888360977173, "sampling/importance_sampling_ratio/mean": 0.7287424802780151, "sampling/importance_sampling_ratio/min": 8.248111043940298e-06, "sampling/sampling_logp_difference/max": 1.9602928161621094, "sampling/sampling_logp_difference/mean": 0.31169408559799194, "step": 577, "step_time": 29.524277730015456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.765276774764061, "epoch": 0.00578, "grad_norm": 0.12572526931762695, "kl": 0.2705706339329481, "learning_rate": 7.999891237550908e-06, "loss": -0.1026, "step": 578, "step_time": 13.555220456968527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.125, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2043787948787212, "epoch": 0.00579, "frac_reward_zero_std": 0.0, "grad_norm": 0.0903603658080101, "kl": 0.17627239832654595, "learning_rate": 7.999890835845657e-06, "loss": -0.1116, "num_tokens": 15182154.0, "reward": 0.5475239753723145, "reward_std": 1.211885690689087, "rewards/rollout_reward_func/mean": 0.5475239753723145, "rewards/rollout_reward_func/std": 1.211885690689087, "sampling/importance_sampling_ratio/max": 1.6576921939849854, "sampling/importance_sampling_ratio/mean": 0.6990126967430115, "sampling/importance_sampling_ratio/min": 2.942281525974977e-06, "sampling/sampling_logp_difference/max": 2.0835108757019043, "sampling/sampling_logp_difference/mean": 0.3605726361274719, "step": 579, "step_time": 32.98662083498493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.201514959335327, "epoch": 0.0058, "grad_norm": 0.08425412327051163, "kl": 0.17829539068043232, "learning_rate": 7.999890433399953e-06, "loss": -0.1119, "step": 580, "step_time": 14.380101607006509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.125, "completions/mean_terminated_length": 6.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.6366520076990128, "epoch": 0.00581, "frac_reward_zero_std": 0.0, "grad_norm": 0.31842002272605896, "kl": 0.11977029591798782, "learning_rate": 7.999890030213796e-06, "loss": -0.082, "num_tokens": 15240643.0, "reward": 0.6236342191696167, "reward_std": 1.2032153606414795, "rewards/rollout_reward_func/mean": 0.6236342191696167, "rewards/rollout_reward_func/std": 1.2032153606414795, "sampling/importance_sampling_ratio/max": 1.5690510272979736, "sampling/importance_sampling_ratio/mean": 0.543911337852478, "sampling/importance_sampling_ratio/min": 4.0038462429947685e-06, "sampling/sampling_logp_difference/max": 2.244922637939453, "sampling/sampling_logp_difference/mean": 0.41525572538375854, "step": 581, "step_time": 38.766914970008656 }, { "clip_ratio/high_max": 0.046875, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 2.6286687403917313, "epoch": 0.00582, "grad_norm": 0.17878685891628265, "kl": 0.12614384107291698, "learning_rate": 7.999889626287187e-06, "loss": -0.0839, "step": 582, "step_time": 15.83342911901127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 7.636363983154297, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.1111276745796204, "epoch": 0.00583, "frac_reward_zero_std": 0.0, "grad_norm": 0.06956440210342407, "kl": 0.2704102098941803, "learning_rate": 7.999889221620126e-06, "loss": -0.0638, "num_tokens": 15303357.0, "reward": 0.33304521441459656, "reward_std": 1.357638955116272, "rewards/rollout_reward_func/mean": 0.33304521441459656, "rewards/rollout_reward_func/std": 1.357638955116272, "sampling/importance_sampling_ratio/max": 1.3191869258880615, "sampling/importance_sampling_ratio/mean": 0.38701605796813965, "sampling/importance_sampling_ratio/min": 1.4524006441973825e-09, "sampling/sampling_logp_difference/max": 2.2271604537963867, "sampling/sampling_logp_difference/mean": 0.5402607321739197, "step": 583, "step_time": 37.068655555005535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.1089356392621994, "epoch": 0.00584, "grad_norm": 0.0783088207244873, "kl": 0.33528626896440983, "learning_rate": 7.999888816212612e-06, "loss": -0.0637, "step": 584, "step_time": 15.967046583013143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0036764706019312143, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036764706019312143, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.125, "completions/mean_terminated_length": 5.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.051506135612726, "epoch": 0.00585, "frac_reward_zero_std": 0.0, "grad_norm": 0.138629749417305, "kl": 0.8529120683670044, "learning_rate": 7.999888410064647e-06, "loss": -0.0917, "num_tokens": 15364735.0, "reward": 0.5326781272888184, "reward_std": 1.3199595212936401, "rewards/rollout_reward_func/mean": 0.5326781272888184, "rewards/rollout_reward_func/std": 1.3199595212936401, "sampling/importance_sampling_ratio/max": 1.8982584476470947, "sampling/importance_sampling_ratio/mean": 0.6583524942398071, "sampling/importance_sampling_ratio/min": 2.167849743273109e-06, "sampling/sampling_logp_difference/max": 2.631161689758301, "sampling/sampling_logp_difference/mean": 0.40038228034973145, "step": 585, "step_time": 34.45415689497895 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0036764706019312143, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011488970601931214, "entropy": 2.043198250234127, "epoch": 0.00586, "grad_norm": 0.14184239506721497, "kl": 0.93575194850564, "learning_rate": 7.99988800317623e-06, "loss": -0.092, "step": 586, "step_time": 16.24417008300952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.928571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.454863991588354, "epoch": 0.00587, "frac_reward_zero_std": 0.0, "grad_norm": 0.1913159191608429, "kl": 0.42964744567871094, "learning_rate": 7.99988759554736e-06, "loss": -0.0663, "num_tokens": 15430420.0, "reward": 0.3553842306137085, "reward_std": 1.2249133586883545, "rewards/rollout_reward_func/mean": 0.3553842306137085, "rewards/rollout_reward_func/std": 1.2249133586883545, "sampling/importance_sampling_ratio/max": 1.3122762441635132, "sampling/importance_sampling_ratio/mean": 0.7642481327056885, "sampling/importance_sampling_ratio/min": 1.6256648223134107e-06, "sampling/sampling_logp_difference/max": 1.9892578125, "sampling/sampling_logp_difference/mean": 0.2777494192123413, "step": 587, "step_time": 34.36846889401204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4529114291071892, "epoch": 0.00588, "grad_norm": 0.1846769154071808, "kl": 0.42569420486688614, "learning_rate": 7.999887187178037e-06, "loss": -0.0662, "step": 588, "step_time": 16.794866713986266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.15625, "completions/mean_terminated_length": 5.5416669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.07628283649683, "epoch": 0.00589, "frac_reward_zero_std": 0.0, "grad_norm": 0.11615214496850967, "kl": 0.44070179061964154, "learning_rate": 7.999886778068266e-06, "loss": -0.0949, "num_tokens": 15482035.0, "reward": 0.5769476294517517, "reward_std": 1.2854543924331665, "rewards/rollout_reward_func/mean": 0.5769476294517517, "rewards/rollout_reward_func/std": 1.2854543924331665, "sampling/importance_sampling_ratio/max": 1.2288120985031128, "sampling/importance_sampling_ratio/mean": 0.6324547529220581, "sampling/importance_sampling_ratio/min": 5.561123089137254e-06, "sampling/sampling_logp_difference/max": 1.9070279598236084, "sampling/sampling_logp_difference/mean": 0.39107847213745117, "step": 589, "step_time": 29.98859711199475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.073525495827198, "epoch": 0.0059, "grad_norm": 0.11817717552185059, "kl": 0.45272359484806657, "learning_rate": 7.999886368218042e-06, "loss": -0.0949, "step": 590, "step_time": 13.511945107980864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 4.888888835906982, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3429104145616293, "epoch": 0.00591, "frac_reward_zero_std": 0.25, "grad_norm": 0.0839308351278305, "kl": 1.1709339572116733, "learning_rate": 7.999885957627366e-06, "loss": -0.0765, "num_tokens": 15529502.0, "reward": 1.2838786840438843, "reward_std": 1.1633765697479248, "rewards/rollout_reward_func/mean": 1.2838786840438843, "rewards/rollout_reward_func/std": 1.1633765697479248, "sampling/importance_sampling_ratio/max": 1.2563114166259766, "sampling/importance_sampling_ratio/mean": 0.7731651067733765, "sampling/importance_sampling_ratio/min": 0.00023326628434006125, "sampling/sampling_logp_difference/max": 1.8546686172485352, "sampling/sampling_logp_difference/mean": 0.2572508752346039, "step": 591, "step_time": 29.83514159799961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3358498802408576, "epoch": 0.00592, "grad_norm": 0.0826646015048027, "kl": 1.1669912338256836, "learning_rate": 7.99988554629624e-06, "loss": -0.0766, "step": 592, "step_time": 14.925762629020028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.53125, "completions/mean_terminated_length": 4.448276042938232, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9863330107182264, "epoch": 0.00593, "frac_reward_zero_std": 0.25, "grad_norm": 0.06988105922937393, "kl": 0.27085138484835625, "learning_rate": 7.999885134224663e-06, "loss": -0.0579, "num_tokens": 15566976.0, "reward": 1.0295469760894775, "reward_std": 1.401397705078125, "rewards/rollout_reward_func/mean": 1.0295469760894775, "rewards/rollout_reward_func/std": 1.4013975858688354, "sampling/importance_sampling_ratio/max": 1.2924866676330566, "sampling/importance_sampling_ratio/mean": 0.9506821036338806, "sampling/importance_sampling_ratio/min": 5.09272513227188e-06, "sampling/sampling_logp_difference/max": 1.966510534286499, "sampling/sampling_logp_difference/mean": 0.24233807623386383, "step": 593, "step_time": 17.440520027987077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9933901242911816, "epoch": 0.00594, "grad_norm": 0.06916726380586624, "kl": 0.2683302368968725, "learning_rate": 7.999884721412632e-06, "loss": -0.0579, "step": 594, "step_time": 10.299013730022125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.21875, "completions/mean_terminated_length": 6.136363983154297, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.58387553691864, "epoch": 0.00595, "frac_reward_zero_std": 0.0, "grad_norm": 0.04924781620502472, "kl": 0.31288338731974363, "learning_rate": 7.999884307860151e-06, "loss": -0.0924, "num_tokens": 15623906.0, "reward": 0.33408090472221375, "reward_std": 1.3419057130813599, "rewards/rollout_reward_func/mean": 0.33408090472221375, "rewards/rollout_reward_func/std": 1.3419057130813599, "sampling/importance_sampling_ratio/max": 1.2718473672866821, "sampling/importance_sampling_ratio/mean": 0.49916213750839233, "sampling/importance_sampling_ratio/min": 1.6661076074342418e-08, "sampling/sampling_logp_difference/max": 2.0014920234680176, "sampling/sampling_logp_difference/mean": 0.47348934412002563, "step": 595, "step_time": 32.83111407702381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5861090272665024, "epoch": 0.00596, "grad_norm": 0.04856574907898903, "kl": 0.319547601044178, "learning_rate": 7.999883893567221e-06, "loss": -0.0925, "step": 596, "step_time": 14.18051973299589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.125, "completions/mean_terminated_length": 5.523809432983398, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.420226275920868, "epoch": 0.00597, "frac_reward_zero_std": 0.0, "grad_norm": 0.2640339136123657, "kl": 0.1875306675210595, "learning_rate": 7.999883478533838e-06, "loss": -0.1148, "num_tokens": 15685968.0, "reward": 0.052030935883522034, "reward_std": 1.1741068363189697, "rewards/rollout_reward_func/mean": 0.052030935883522034, "rewards/rollout_reward_func/std": 1.1741067171096802, "sampling/importance_sampling_ratio/max": 1.558842420578003, "sampling/importance_sampling_ratio/mean": 0.6105977296829224, "sampling/importance_sampling_ratio/min": 3.811869646597188e-07, "sampling/sampling_logp_difference/max": 2.1461801528930664, "sampling/sampling_logp_difference/mean": 0.405228853225708, "step": 597, "step_time": 38.14212304602552 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 2.417830988764763, "epoch": 0.00598, "grad_norm": 0.14989599585533142, "kl": 0.18953272607177496, "learning_rate": 7.999883062760005e-06, "loss": -0.1156, "step": 598, "step_time": 16.50434514299559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.09375, "completions/mean_terminated_length": 5.44444465637207, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.637983338907361, "epoch": 0.00599, "frac_reward_zero_std": 0.0, "grad_norm": 0.05811268463730812, "kl": 0.928349856287241, "learning_rate": 7.999882646245722e-06, "loss": -0.0862, "num_tokens": 15733100.0, "reward": 1.0926520824432373, "reward_std": 1.2166281938552856, "rewards/rollout_reward_func/mean": 1.0926520824432373, "rewards/rollout_reward_func/std": 1.2166281938552856, "sampling/importance_sampling_ratio/max": 1.2070780992507935, "sampling/importance_sampling_ratio/mean": 0.7507331371307373, "sampling/importance_sampling_ratio/min": 1.0972862583003007e-06, "sampling/sampling_logp_difference/max": 2.849269390106201, "sampling/sampling_logp_difference/mean": 0.36507928371429443, "step": 599, "step_time": 25.386734177023754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.633674655109644, "epoch": 0.006, "grad_norm": 0.06968498975038528, "kl": 0.9900916442275047, "learning_rate": 7.999882228990988e-06, "loss": -0.086, "step": 600, "step_time": 13.78954483299458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.15625, "completions/mean_terminated_length": 5.115384578704834, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8818899616599083, "epoch": 0.00601, "frac_reward_zero_std": 0.25, "grad_norm": 0.062437091022729874, "kl": 0.1880307588726282, "learning_rate": 7.999881810995804e-06, "loss": -0.0777, "num_tokens": 15788990.0, "reward": 0.8365066051483154, "reward_std": 1.2526366710662842, "rewards/rollout_reward_func/mean": 0.8365066051483154, "rewards/rollout_reward_func/std": 1.2526365518569946, "sampling/importance_sampling_ratio/max": 1.2771059274673462, "sampling/importance_sampling_ratio/mean": 0.7603503465652466, "sampling/importance_sampling_ratio/min": 4.6054748963797465e-05, "sampling/sampling_logp_difference/max": 1.9372588396072388, "sampling/sampling_logp_difference/mean": 0.31621646881103516, "step": 601, "step_time": 31.98066926500178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.874633714556694, "epoch": 0.00602, "grad_norm": 0.0560297966003418, "kl": 0.18818902131170034, "learning_rate": 7.99988139226017e-06, "loss": -0.078, "step": 602, "step_time": 14.962101566008641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.78125, "completions/mean_terminated_length": 6.050000190734863, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.28952931240201, "epoch": 0.00603, "frac_reward_zero_std": 0.0, "grad_norm": 0.16234023869037628, "kl": 1.0392267275601625, "learning_rate": 7.999880972784085e-06, "loss": -0.044, "num_tokens": 15854170.0, "reward": -0.29010823369026184, "reward_std": 0.9671831130981445, "rewards/rollout_reward_func/mean": -0.29010823369026184, "rewards/rollout_reward_func/std": 0.9671831130981445, "sampling/importance_sampling_ratio/max": 1.5671706199645996, "sampling/importance_sampling_ratio/mean": 0.5280386805534363, "sampling/importance_sampling_ratio/min": 1.6168734191523981e-06, "sampling/sampling_logp_difference/max": 2.1691079139709473, "sampling/sampling_logp_difference/mean": 0.40470823645591736, "step": 603, "step_time": 35.067822162018274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.284081345424056, "epoch": 0.00604, "grad_norm": 0.15744437277317047, "kl": 1.07672237791121, "learning_rate": 7.99988055256755e-06, "loss": -0.044, "step": 604, "step_time": 14.779440853992128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.84375, "completions/mean_terminated_length": 4.136363983154297, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7251327065750957, "epoch": 0.00605, "frac_reward_zero_std": 0.25, "grad_norm": 0.10191750526428223, "kl": 0.4885296653956175, "learning_rate": 7.999880131610566e-06, "loss": -0.0087, "num_tokens": 15905847.0, "reward": 0.04966059327125549, "reward_std": 1.3027878999710083, "rewards/rollout_reward_func/mean": 0.04966059327125549, "rewards/rollout_reward_func/std": 1.3027877807617188, "sampling/importance_sampling_ratio/max": 1.2063336372375488, "sampling/importance_sampling_ratio/mean": 0.6659730672836304, "sampling/importance_sampling_ratio/min": 5.608263009548864e-08, "sampling/sampling_logp_difference/max": 1.921197533607483, "sampling/sampling_logp_difference/mean": 0.36530739068984985, "step": 605, "step_time": 31.582769163011108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.728381671011448, "epoch": 0.00606, "grad_norm": 0.09188174456357956, "kl": 0.428864074870944, "learning_rate": 7.99987970991313e-06, "loss": -0.0091, "step": 606, "step_time": 14.116818086025887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.4375, "completions/mean_terminated_length": 4.583333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6270694499835372, "epoch": 0.00607, "frac_reward_zero_std": 0.25, "grad_norm": 0.014950016513466835, "kl": 0.4486172366887331, "learning_rate": 7.999879287475248e-06, "loss": -0.0732, "num_tokens": 15953107.0, "reward": 1.139967918395996, "reward_std": 1.29047691822052, "rewards/rollout_reward_func/mean": 1.139967918395996, "rewards/rollout_reward_func/std": 1.2904770374298096, "sampling/importance_sampling_ratio/max": 1.3581795692443848, "sampling/importance_sampling_ratio/mean": 0.743272066116333, "sampling/importance_sampling_ratio/min": 2.62511930486653e-05, "sampling/sampling_logp_difference/max": 2.151707172393799, "sampling/sampling_logp_difference/mean": 0.3016769289970398, "step": 607, "step_time": 28.15901744797884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6283651180565357, "epoch": 0.00608, "grad_norm": 0.015119686722755432, "kl": 0.44841770827770233, "learning_rate": 7.999878864296913e-06, "loss": -0.0732, "step": 608, "step_time": 14.089952469003038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.15625, "completions/mean_terminated_length": 5.115384578704834, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.746300682425499, "epoch": 0.00609, "frac_reward_zero_std": 0.25, "grad_norm": 0.18816538155078888, "kl": 0.3628408145159483, "learning_rate": 7.999878440378129e-06, "loss": -0.0528, "num_tokens": 16009172.0, "reward": 0.3244937062263489, "reward_std": 1.1001571416854858, "rewards/rollout_reward_func/mean": 0.3244937062263489, "rewards/rollout_reward_func/std": 1.1001571416854858, "sampling/importance_sampling_ratio/max": 1.2829104661941528, "sampling/importance_sampling_ratio/mean": 0.6929024457931519, "sampling/importance_sampling_ratio/min": 0.0002664932399056852, "sampling/sampling_logp_difference/max": 1.618696928024292, "sampling/sampling_logp_difference/mean": 0.3140292763710022, "step": 609, "step_time": 31.541972362974775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7569177895784378, "epoch": 0.0061, "grad_norm": 0.18392810225486755, "kl": 0.328449372202158, "learning_rate": 7.999878015718894e-06, "loss": -0.0531, "step": 610, "step_time": 13.769554483005777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.34375, "completions/mean_terminated_length": 4.920000076293945, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7568169496953487, "epoch": 0.00611, "frac_reward_zero_std": 0.25, "grad_norm": 0.1534217745065689, "kl": 0.346065491437912, "learning_rate": 7.999877590319212e-06, "loss": -0.0603, "num_tokens": 16064548.0, "reward": 0.4134804606437683, "reward_std": 1.3415592908859253, "rewards/rollout_reward_func/mean": 0.4134804606437683, "rewards/rollout_reward_func/std": 1.3415592908859253, "sampling/importance_sampling_ratio/max": 1.3130033016204834, "sampling/importance_sampling_ratio/mean": 0.6410002708435059, "sampling/importance_sampling_ratio/min": 1.0136326267229379e-07, "sampling/sampling_logp_difference/max": 1.7698407173156738, "sampling/sampling_logp_difference/mean": 0.3106573820114136, "step": 611, "step_time": 31.78720730401983 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 1.7686728071421385, "epoch": 0.00612, "grad_norm": 0.0836997851729393, "kl": 0.31232325453311205, "learning_rate": 7.99987716417908e-06, "loss": -0.0612, "step": 612, "step_time": 13.611304856982315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.8125, "completions/mean_terminated_length": 5.083333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8384812735021114, "epoch": 0.00613, "frac_reward_zero_std": 0.25, "grad_norm": 0.06416532397270203, "kl": 0.2506912243552506, "learning_rate": 7.999876737298501e-06, "loss": -0.0632, "num_tokens": 16119788.0, "reward": 0.6609145402908325, "reward_std": 1.2546266317367554, "rewards/rollout_reward_func/mean": 0.6609145402908325, "rewards/rollout_reward_func/std": 1.2546265125274658, "sampling/importance_sampling_ratio/max": 1.168152093887329, "sampling/importance_sampling_ratio/mean": 0.6632275581359863, "sampling/importance_sampling_ratio/min": 2.81234024441801e-05, "sampling/sampling_logp_difference/max": 1.7048438787460327, "sampling/sampling_logp_difference/mean": 0.30975449085235596, "step": 613, "step_time": 34.9421195120085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8537157699465752, "epoch": 0.00614, "grad_norm": 0.06612294912338257, "kl": 0.2452445076778531, "learning_rate": 7.99987630967747e-06, "loss": -0.0636, "step": 614, "step_time": 14.94164768900373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.1875, "completions/mean_terminated_length": 4.4666666984558105, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8517725057899952, "epoch": 0.00615, "frac_reward_zero_std": 0.0, "grad_norm": 0.030005797743797302, "kl": 0.8567212913185358, "learning_rate": 7.99987588131599e-06, "loss": -0.0734, "num_tokens": 16155328.0, "reward": 0.9598181247711182, "reward_std": 1.3779704570770264, "rewards/rollout_reward_func/mean": 0.9598181247711182, "rewards/rollout_reward_func/std": 1.3779704570770264, "sampling/importance_sampling_ratio/max": 1.2184984683990479, "sampling/importance_sampling_ratio/mean": 0.8771871328353882, "sampling/importance_sampling_ratio/min": 1.5050271429117856e-07, "sampling/sampling_logp_difference/max": 2.0479514598846436, "sampling/sampling_logp_difference/mean": 0.23603466153144836, "step": 615, "step_time": 19.896593605022645 }, { "clip_ratio/high_max": 0.01657197019085288, "clip_ratio/high_mean": 0.00828598509542644, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00828598509542644, "entropy": 0.8617645371705294, "epoch": 0.00616, "grad_norm": 0.02975698560476303, "kl": 0.8071750849485397, "learning_rate": 7.999875452214063e-06, "loss": -0.0735, "step": 616, "step_time": 10.947715884016361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.21875, "completions/mean_terminated_length": 5.625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8802894223481417, "epoch": 0.00617, "frac_reward_zero_std": 0.0, "grad_norm": 0.10517159104347229, "kl": 0.4887454239651561, "learning_rate": 7.999875022371686e-06, "loss": -0.0312, "num_tokens": 16217634.0, "reward": 0.36147475242614746, "reward_std": 1.2978452444076538, "rewards/rollout_reward_func/mean": 0.36147475242614746, "rewards/rollout_reward_func/std": 1.2978452444076538, "sampling/importance_sampling_ratio/max": 1.2396477460861206, "sampling/importance_sampling_ratio/mean": 0.6546409130096436, "sampling/importance_sampling_ratio/min": 0.00025148625718429685, "sampling/sampling_logp_difference/max": 1.8225443363189697, "sampling/sampling_logp_difference/mean": 0.30883821845054626, "step": 617, "step_time": 32.262390191986924 }, { "clip_ratio/high_max": 0.012787723913788795, "clip_ratio/high_mean": 0.006393861956894398, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006393861956894398, "entropy": 1.8919894192367792, "epoch": 0.00618, "grad_norm": 0.10721515864133835, "kl": 0.4360720692202449, "learning_rate": 7.99987459178886e-06, "loss": -0.0321, "step": 618, "step_time": 14.4273821139941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.96875, "completions/mean_terminated_length": 5.2916669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.968897427432239, "epoch": 0.00619, "frac_reward_zero_std": 0.0, "grad_norm": 0.07498227059841156, "kl": 0.2391004730015993, "learning_rate": 7.999874160465587e-06, "loss": -0.1001, "num_tokens": 16272918.0, "reward": 0.7159085273742676, "reward_std": 1.3414108753204346, "rewards/rollout_reward_func/mean": 0.7159085273742676, "rewards/rollout_reward_func/std": 1.3414108753204346, "sampling/importance_sampling_ratio/max": 1.2428988218307495, "sampling/importance_sampling_ratio/mean": 0.6523228883743286, "sampling/importance_sampling_ratio/min": 6.313434369076276e-06, "sampling/sampling_logp_difference/max": 2.0467429161071777, "sampling/sampling_logp_difference/mean": 0.3766807019710541, "step": 619, "step_time": 29.181405721028568 }, { "clip_ratio/high_max": 0.004464285913854837, "clip_ratio/high_mean": 0.0022321429569274187, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022321429569274187, "entropy": 1.9958111960440874, "epoch": 0.0062, "grad_norm": 0.07897111773490906, "kl": 0.22619786765426397, "learning_rate": 7.999873728401865e-06, "loss": -0.1002, "step": 620, "step_time": 12.783004614015226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.59375, "completions/mean_terminated_length": 5.695652484893799, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.591847151517868, "epoch": 0.00621, "frac_reward_zero_std": 0.0, "grad_norm": 0.1094328835606575, "kl": 0.2752160709351301, "learning_rate": 7.999873295597694e-06, "loss": -0.082, "num_tokens": 16329584.0, "reward": 0.0064402371644973755, "reward_std": 1.2192420959472656, "rewards/rollout_reward_func/mean": 0.0064402371644973755, "rewards/rollout_reward_func/std": 1.219241976737976, "sampling/importance_sampling_ratio/max": 1.1952382326126099, "sampling/importance_sampling_ratio/mean": 0.5668253898620605, "sampling/importance_sampling_ratio/min": 2.176335556214326e-07, "sampling/sampling_logp_difference/max": 2.4931812286376953, "sampling/sampling_logp_difference/mean": 0.43784278631210327, "step": 621, "step_time": 31.645704119015136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.6032126545906067, "epoch": 0.00622, "grad_norm": 0.11030545085668564, "kl": 0.2683875309303403, "learning_rate": 7.999872862053077e-06, "loss": -0.0819, "step": 622, "step_time": 15.78850191499805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.65625, "completions/mean_terminated_length": 5.730769634246826, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7632246371358633, "epoch": 0.00623, "frac_reward_zero_std": 0.0, "grad_norm": 0.09699016064405441, "kl": 0.19440038315951824, "learning_rate": 7.99987242776801e-06, "loss": -0.0838, "num_tokens": 16374655.0, "reward": 1.1206412315368652, "reward_std": 1.29878568649292, "rewards/rollout_reward_func/mean": 1.1206412315368652, "rewards/rollout_reward_func/std": 1.29878568649292, "sampling/importance_sampling_ratio/max": 1.1970775127410889, "sampling/importance_sampling_ratio/mean": 0.7073753476142883, "sampling/importance_sampling_ratio/min": 9.265180415241048e-05, "sampling/sampling_logp_difference/max": 2.0492746829986572, "sampling/sampling_logp_difference/mean": 0.3218686580657959, "step": 623, "step_time": 29.07141381400288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7655764296650887, "epoch": 0.00624, "grad_norm": 0.09441360086202621, "kl": 0.19485049322247505, "learning_rate": 7.999871992742494e-06, "loss": -0.0839, "step": 624, "step_time": 12.792831234968617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.6875, "completions/mean_terminated_length": 6.555555820465088, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.853126510977745, "epoch": 0.00625, "frac_reward_zero_std": 0.0, "grad_norm": 0.09585366398096085, "kl": 0.20365045871585608, "learning_rate": 7.999871556976533e-06, "loss": -0.0988, "num_tokens": 16438354.0, "reward": -0.1429901272058487, "reward_std": 1.1383366584777832, "rewards/rollout_reward_func/mean": -0.1429901272058487, "rewards/rollout_reward_func/std": 1.1383366584777832, "sampling/importance_sampling_ratio/max": 1.3399553298950195, "sampling/importance_sampling_ratio/mean": 0.38950198888778687, "sampling/importance_sampling_ratio/min": 8.84109496723795e-08, "sampling/sampling_logp_difference/max": 1.7942273616790771, "sampling/sampling_logp_difference/mean": 0.4458191990852356, "step": 625, "step_time": 32.6106644559477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8466121703386307, "epoch": 0.00626, "grad_norm": 0.09556715190410614, "kl": 0.2156430957838893, "learning_rate": 7.999871120470122e-06, "loss": -0.0988, "step": 626, "step_time": 13.633405961983954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.65625, "completions/mean_terminated_length": 4.875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5682311411947012, "epoch": 0.00627, "frac_reward_zero_std": 0.25, "grad_norm": 0.11625862866640091, "kl": 0.28405727073550224, "learning_rate": 7.999870683223264e-06, "loss": -0.0572, "num_tokens": 16491662.0, "reward": 0.736972987651825, "reward_std": 1.2623176574707031, "rewards/rollout_reward_func/mean": 0.736972987651825, "rewards/rollout_reward_func/std": 1.2623176574707031, "sampling/importance_sampling_ratio/max": 1.184605360031128, "sampling/importance_sampling_ratio/mean": 0.6983476877212524, "sampling/importance_sampling_ratio/min": 5.2227944252081215e-05, "sampling/sampling_logp_difference/max": 1.8204410076141357, "sampling/sampling_logp_difference/mean": 0.24709276854991913, "step": 627, "step_time": 38.46936595400621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5527399629354477, "epoch": 0.00628, "grad_norm": 0.108783058822155, "kl": 0.3034310173243284, "learning_rate": 7.99987024523596e-06, "loss": -0.0576, "step": 628, "step_time": 16.064678965019993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.34375, "completions/mean_terminated_length": 4.920000076293945, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8140658102929592, "epoch": 0.00629, "frac_reward_zero_std": 0.0, "grad_norm": 0.03599654510617256, "kl": 0.3847565362229943, "learning_rate": 7.999869806508206e-06, "loss": -0.1004, "num_tokens": 16549398.0, "reward": 0.3627853989601135, "reward_std": 1.1991016864776611, "rewards/rollout_reward_func/mean": 0.3627853989601135, "rewards/rollout_reward_func/std": 1.1991015672683716, "sampling/importance_sampling_ratio/max": 1.3798465728759766, "sampling/importance_sampling_ratio/mean": 0.7392640709877014, "sampling/importance_sampling_ratio/min": 1.0578761248325463e-05, "sampling/sampling_logp_difference/max": 1.6585055589675903, "sampling/sampling_logp_difference/mean": 0.33449387550354004, "step": 629, "step_time": 30.887546430021757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8037601504474878, "epoch": 0.0063, "grad_norm": 0.031592633575201035, "kl": 0.40366125851869583, "learning_rate": 7.999869367040006e-06, "loss": -0.1006, "step": 630, "step_time": 14.031721652994747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 6.666666507720947, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.8136186599731445, "epoch": 0.00631, "frac_reward_zero_std": 0.0, "grad_norm": 0.15747952461242676, "kl": 0.21635959018021822, "learning_rate": 7.999868926831357e-06, "loss": -0.0771, "num_tokens": 16608808.0, "reward": 0.17088907957077026, "reward_std": 1.2930090427398682, "rewards/rollout_reward_func/mean": 0.17088907957077026, "rewards/rollout_reward_func/std": 1.2930090427398682, "sampling/importance_sampling_ratio/max": 1.525652527809143, "sampling/importance_sampling_ratio/mean": 0.38720640540122986, "sampling/importance_sampling_ratio/min": 1.698004155059607e-07, "sampling/sampling_logp_difference/max": 2.7392163276672363, "sampling/sampling_logp_difference/mean": 0.46710237860679626, "step": 631, "step_time": 33.12355557200499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.793825462460518, "epoch": 0.00632, "grad_norm": 0.14094369113445282, "kl": 0.2240449795499444, "learning_rate": 7.999868485882263e-06, "loss": -0.0783, "step": 632, "step_time": 14.259676314002718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.4375, "completions/mean_terminated_length": 4.476190567016602, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8430624306201935, "epoch": 0.00633, "frac_reward_zero_std": 0.0, "grad_norm": 0.06484326720237732, "kl": 0.2952120751142502, "learning_rate": 7.999868044192721e-06, "loss": -0.103, "num_tokens": 16665452.0, "reward": 0.783236026763916, "reward_std": 1.266560673713684, "rewards/rollout_reward_func/mean": 0.783236026763916, "rewards/rollout_reward_func/std": 1.266560673713684, "sampling/importance_sampling_ratio/max": 1.4464056491851807, "sampling/importance_sampling_ratio/mean": 0.6402239203453064, "sampling/importance_sampling_ratio/min": 9.125696465162036e-07, "sampling/sampling_logp_difference/max": 1.9911404848098755, "sampling/sampling_logp_difference/mean": 0.33874672651290894, "step": 633, "step_time": 32.8044972679927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 1.8258660435676575, "epoch": 0.00634, "grad_norm": 0.028548335656523705, "kl": 0.31721014250069857, "learning_rate": 7.999867601762732e-06, "loss": -0.1032, "step": 634, "step_time": 13.074895337005728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.3125, "completions/mean_terminated_length": 5.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.840907983481884, "epoch": 0.00635, "frac_reward_zero_std": 0.25, "grad_norm": 0.07878435403108597, "kl": 0.4508793633431196, "learning_rate": 7.999867158592297e-06, "loss": -0.0734, "num_tokens": 16713427.0, "reward": 0.9776698350906372, "reward_std": 1.3437398672103882, "rewards/rollout_reward_func/mean": 0.9776698350906372, "rewards/rollout_reward_func/std": 1.3437398672103882, "sampling/importance_sampling_ratio/max": 1.5784382820129395, "sampling/importance_sampling_ratio/mean": 0.624538779258728, "sampling/importance_sampling_ratio/min": 4.608512881532079e-06, "sampling/sampling_logp_difference/max": 2.000871181488037, "sampling/sampling_logp_difference/mean": 0.3327597975730896, "step": 635, "step_time": 31.733066536005936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 1.8332599252462387, "epoch": 0.00636, "grad_norm": 0.06087948754429817, "kl": 0.5079739224165678, "learning_rate": 7.999866714681415e-06, "loss": -0.0736, "step": 636, "step_time": 13.351119016006123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.84375, "completions/mean_terminated_length": 4.793103218078613, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0151474950835109, "epoch": 0.00637, "frac_reward_zero_std": 0.25, "grad_norm": 0.04216570779681206, "kl": 0.2812536619603634, "learning_rate": 7.999866270030085e-06, "loss": -0.0666, "num_tokens": 16759528.0, "reward": 1.258728265762329, "reward_std": 1.1641391515731812, "rewards/rollout_reward_func/mean": 1.258728265762329, "rewards/rollout_reward_func/std": 1.1641390323638916, "sampling/importance_sampling_ratio/max": 1.4040948152542114, "sampling/importance_sampling_ratio/mean": 0.8569121360778809, "sampling/importance_sampling_ratio/min": 2.857897743524518e-05, "sampling/sampling_logp_difference/max": 1.7352159023284912, "sampling/sampling_logp_difference/mean": 0.21411697566509247, "step": 637, "step_time": 31.12601559299219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.0092587391845882, "epoch": 0.00638, "grad_norm": 0.04116414859890938, "kl": 0.283160119317472, "learning_rate": 7.999865824638311e-06, "loss": -0.0668, "step": 638, "step_time": 16.08394530901569 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 5.384615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.987510790117085, "epoch": 0.00639, "frac_reward_zero_std": 0.0, "grad_norm": 0.14561727643013, "kl": 0.8604615638032556, "learning_rate": 7.999865378506088e-06, "loss": -0.0476, "num_tokens": 16814433.0, "reward": 0.2906574606895447, "reward_std": 1.4461593627929688, "rewards/rollout_reward_func/mean": 0.2906574606895447, "rewards/rollout_reward_func/std": 1.4461593627929688, "sampling/importance_sampling_ratio/max": 1.384115219116211, "sampling/importance_sampling_ratio/mean": 0.7078425884246826, "sampling/importance_sampling_ratio/min": 3.852380192626015e-09, "sampling/sampling_logp_difference/max": 2.217742919921875, "sampling/sampling_logp_difference/mean": 0.41098809242248535, "step": 639, "step_time": 28.734508553985506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.980656374245882, "epoch": 0.0064, "grad_norm": 0.16356655955314636, "kl": 0.9050436103716493, "learning_rate": 7.999864931633422e-06, "loss": -0.0474, "step": 640, "step_time": 14.064887646993157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.15625, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2529794359579682, "epoch": 0.00641, "frac_reward_zero_std": 0.0, "grad_norm": 0.06358350813388824, "kl": 0.2699223980307579, "learning_rate": 7.999864484020307e-06, "loss": -0.0702, "num_tokens": 16870622.0, "reward": -0.03371062129735947, "reward_std": 1.2276958227157593, "rewards/rollout_reward_func/mean": -0.03371062129735947, "rewards/rollout_reward_func/std": 1.2276957035064697, "sampling/importance_sampling_ratio/max": 1.2428863048553467, "sampling/importance_sampling_ratio/mean": 0.8391226530075073, "sampling/importance_sampling_ratio/min": 0.0001234754890901968, "sampling/sampling_logp_difference/max": 1.3715897798538208, "sampling/sampling_logp_difference/mean": 0.23023749887943268, "step": 641, "step_time": 28.711015783977928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2552782758139074, "epoch": 0.00642, "grad_norm": 0.06249343603849411, "kl": 0.26905833184719086, "learning_rate": 7.999864035666747e-06, "loss": -0.0702, "step": 642, "step_time": 13.671731597016333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.71875, "completions/mean_terminated_length": 5.392857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4281063042581081, "epoch": 0.00643, "frac_reward_zero_std": 0.25, "grad_norm": 0.10456235706806183, "kl": 0.28341250121593475, "learning_rate": 7.999863586572742e-06, "loss": -0.0578, "num_tokens": 16926632.0, "reward": 0.8193034529685974, "reward_std": 1.2698644399642944, "rewards/rollout_reward_func/mean": 0.8193034529685974, "rewards/rollout_reward_func/std": 1.2698644399642944, "sampling/importance_sampling_ratio/max": 1.3968617916107178, "sampling/importance_sampling_ratio/mean": 0.7510960102081299, "sampling/importance_sampling_ratio/min": 0.0002400515804765746, "sampling/sampling_logp_difference/max": 1.618889570236206, "sampling/sampling_logp_difference/mean": 0.24417364597320557, "step": 643, "step_time": 33.88908213801915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.437938654795289, "epoch": 0.00644, "grad_norm": 0.12105990946292877, "kl": 0.2814563550055027, "learning_rate": 7.99986313673829e-06, "loss": -0.058, "step": 644, "step_time": 14.963239252014318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 4.692307949066162, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7001366466283798, "epoch": 0.00645, "frac_reward_zero_std": 0.0, "grad_norm": 0.3143741488456726, "kl": 0.5508825536817312, "learning_rate": 7.999862686163393e-06, "loss": -0.0707, "num_tokens": 16981273.0, "reward": 0.6584415435791016, "reward_std": 1.0883922576904297, "rewards/rollout_reward_func/mean": 0.6584415435791016, "rewards/rollout_reward_func/std": 1.0883922576904297, "sampling/importance_sampling_ratio/max": 1.5033955574035645, "sampling/importance_sampling_ratio/mean": 0.6984760761260986, "sampling/importance_sampling_ratio/min": 7.4100792168962926e-09, "sampling/sampling_logp_difference/max": 1.9561543464660645, "sampling/sampling_logp_difference/mean": 0.39964210987091064, "step": 645, "step_time": 30.979963623001822 }, { "clip_ratio/high_max": 0.05422794120386243, "clip_ratio/high_mean": 0.027113970601931214, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.027113970601931214, "entropy": 1.689929038286209, "epoch": 0.00646, "grad_norm": 0.1943558305501938, "kl": 0.523704981431365, "learning_rate": 7.999862234848049e-06, "loss": -0.072, "step": 646, "step_time": 15.396653600997524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.0625, "completions/mean_terminated_length": 6.592592716217041, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2611188888549805, "epoch": 0.00647, "frac_reward_zero_std": 0.0, "grad_norm": 0.05497965216636658, "kl": 0.4972954774275422, "learning_rate": 7.99986178279226e-06, "loss": -0.0962, "num_tokens": 17032125.0, "reward": 0.784270703792572, "reward_std": 1.30219304561615, "rewards/rollout_reward_func/mean": 0.784270703792572, "rewards/rollout_reward_func/std": 1.30219304561615, "sampling/importance_sampling_ratio/max": 1.1915233135223389, "sampling/importance_sampling_ratio/mean": 0.6377763748168945, "sampling/importance_sampling_ratio/min": 3.063735221076058e-07, "sampling/sampling_logp_difference/max": 2.454148292541504, "sampling/sampling_logp_difference/mean": 0.4138449430465698, "step": 647, "step_time": 30.348580567006138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.262140244245529, "epoch": 0.00648, "grad_norm": 0.0555327869951725, "kl": 0.49628905951976776, "learning_rate": 7.999861329996028e-06, "loss": -0.0961, "step": 648, "step_time": 14.134975194974686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 4.307692527770996, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.133096414618194, "epoch": 0.00649, "frac_reward_zero_std": 0.0, "grad_norm": 0.08236321806907654, "kl": 0.7630933718755841, "learning_rate": 7.999860876459348e-06, "loss": -0.0813, "num_tokens": 17094509.0, "reward": 0.8841426372528076, "reward_std": 1.1261274814605713, "rewards/rollout_reward_func/mean": 0.8841426372528076, "rewards/rollout_reward_func/std": 1.1261274814605713, "sampling/importance_sampling_ratio/max": 1.4700440168380737, "sampling/importance_sampling_ratio/mean": 0.7882883548736572, "sampling/importance_sampling_ratio/min": 2.9451196041918593e-06, "sampling/sampling_logp_difference/max": 2.1721572875976562, "sampling/sampling_logp_difference/mean": 0.2737160921096802, "step": 649, "step_time": 31.252563147994806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1353680933825672, "epoch": 0.0065, "grad_norm": 0.0874340683221817, "kl": 0.7698811991140246, "learning_rate": 7.999860422182224e-06, "loss": -0.0812, "step": 650, "step_time": 13.897104852003395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 4.538461685180664, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4217216912657022, "epoch": 0.00651, "frac_reward_zero_std": 0.0, "grad_norm": 0.10300086438655853, "kl": 0.29115719720721245, "learning_rate": 7.999859967164654e-06, "loss": -0.0684, "num_tokens": 17152361.0, "reward": 0.316339910030365, "reward_std": 1.1351854801177979, "rewards/rollout_reward_func/mean": 0.316339910030365, "rewards/rollout_reward_func/std": 1.1351853609085083, "sampling/importance_sampling_ratio/max": 1.5603538751602173, "sampling/importance_sampling_ratio/mean": 0.8004910945892334, "sampling/importance_sampling_ratio/min": 1.4229341331883916e-06, "sampling/sampling_logp_difference/max": 2.2413012981414795, "sampling/sampling_logp_difference/mean": 0.25034403800964355, "step": 651, "step_time": 30.27787912999338 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.4247671030461788, "epoch": 0.00652, "grad_norm": 0.09173097461462021, "kl": 0.2867298685014248, "learning_rate": 7.99985951140664e-06, "loss": -0.0687, "step": 652, "step_time": 13.606256098981248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.65625, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.683627883438021, "epoch": 0.00653, "frac_reward_zero_std": 0.0, "grad_norm": 0.1730203628540039, "kl": 0.4704637695103884, "learning_rate": 7.999859054908181e-06, "loss": -0.0883, "num_tokens": 17206629.0, "reward": 0.7465319633483887, "reward_std": 1.3159879446029663, "rewards/rollout_reward_func/mean": 0.7465319633483887, "rewards/rollout_reward_func/std": 1.3159880638122559, "sampling/importance_sampling_ratio/max": 1.1652697324752808, "sampling/importance_sampling_ratio/mean": 0.7052735090255737, "sampling/importance_sampling_ratio/min": 2.7378395316191018e-05, "sampling/sampling_logp_difference/max": 1.790719747543335, "sampling/sampling_logp_difference/mean": 0.320892870426178, "step": 653, "step_time": 29.196304877026705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.6958880824968219, "epoch": 0.00654, "grad_norm": 0.15021127462387085, "kl": 0.45887053199112415, "learning_rate": 7.999858597669278e-06, "loss": -0.089, "step": 654, "step_time": 14.076561059991946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.518518447875977, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3204670250415802, "epoch": 0.00655, "frac_reward_zero_std": 0.0, "grad_norm": 0.1550980657339096, "kl": 0.36878756806254387, "learning_rate": 7.99985813968993e-06, "loss": -0.0319, "num_tokens": 17263236.0, "reward": 0.4016118347644806, "reward_std": 1.3392983675003052, "rewards/rollout_reward_func/mean": 0.4016118347644806, "rewards/rollout_reward_func/std": 1.3392982482910156, "sampling/importance_sampling_ratio/max": 1.3785488605499268, "sampling/importance_sampling_ratio/mean": 0.866296648979187, "sampling/importance_sampling_ratio/min": 1.1888498363532563e-07, "sampling/sampling_logp_difference/max": 2.056481122970581, "sampling/sampling_logp_difference/mean": 0.3121553659439087, "step": 655, "step_time": 29.13587421302509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3435047287493944, "epoch": 0.00656, "grad_norm": 0.1755579560995102, "kl": 0.35008738189935684, "learning_rate": 7.999857680970137e-06, "loss": -0.033, "step": 656, "step_time": 14.441506809016573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 4.384615421295166, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3293782249093056, "epoch": 0.00657, "frac_reward_zero_std": 0.5, "grad_norm": 0.06684539467096329, "kl": 0.38873208686709404, "learning_rate": 7.9998572215099e-06, "loss": -0.0184, "num_tokens": 17307870.0, "reward": 0.892642080783844, "reward_std": 1.3932193517684937, "rewards/rollout_reward_func/mean": 0.892642080783844, "rewards/rollout_reward_func/std": 1.3932193517684937, "sampling/importance_sampling_ratio/max": 1.4309900999069214, "sampling/importance_sampling_ratio/mean": 0.8077204823493958, "sampling/importance_sampling_ratio/min": 2.8523865491791867e-09, "sampling/sampling_logp_difference/max": 2.325305938720703, "sampling/sampling_logp_difference/mean": 0.2602348327636719, "step": 657, "step_time": 29.957496708986582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3448691256344318, "epoch": 0.00658, "grad_norm": 0.07217561453580856, "kl": 0.3706601392477751, "learning_rate": 7.99985676130922e-06, "loss": -0.0189, "step": 658, "step_time": 14.647460726002464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.15625, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9078448005020618, "epoch": 0.00659, "frac_reward_zero_std": 0.25, "grad_norm": 0.012680081650614738, "kl": 0.23838340863585472, "learning_rate": 7.999856300368094e-06, "loss": -0.0401, "num_tokens": 17356263.0, "reward": 0.6416189670562744, "reward_std": 1.2594988346099854, "rewards/rollout_reward_func/mean": 0.6416189670562744, "rewards/rollout_reward_func/std": 1.2594987154006958, "sampling/importance_sampling_ratio/max": 1.2555230855941772, "sampling/importance_sampling_ratio/mean": 0.8914765119552612, "sampling/importance_sampling_ratio/min": 9.441020665690303e-05, "sampling/sampling_logp_difference/max": 1.650663137435913, "sampling/sampling_logp_difference/mean": 0.19084395468235016, "step": 659, "step_time": 28.563705527994898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9127278681844473, "epoch": 0.0066, "grad_norm": 0.013107087463140488, "kl": 0.23753569088876247, "learning_rate": 7.999855838686525e-06, "loss": -0.0401, "step": 660, "step_time": 13.56055694798124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.90625, "completions/mean_terminated_length": 5.68181848526001, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.200657770037651, "epoch": 0.00661, "frac_reward_zero_std": 0.0, "grad_norm": 0.09027326107025146, "kl": 0.22280544135719538, "learning_rate": 7.999855376264513e-06, "loss": -0.0813, "num_tokens": 17418885.0, "reward": 0.22660595178604126, "reward_std": 1.1878618001937866, "rewards/rollout_reward_func/mean": 0.22660595178604126, "rewards/rollout_reward_func/std": 1.1878618001937866, "sampling/importance_sampling_ratio/max": 1.2267323732376099, "sampling/importance_sampling_ratio/mean": 0.5706775784492493, "sampling/importance_sampling_ratio/min": 2.8041074983775616e-05, "sampling/sampling_logp_difference/max": 2.1551218032836914, "sampling/sampling_logp_difference/mean": 0.36317208409309387, "step": 661, "step_time": 34.47338726501039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.21424463391304, "epoch": 0.00662, "grad_norm": 0.08998420089483261, "kl": 0.23230242915451527, "learning_rate": 7.999854913102054e-06, "loss": -0.0813, "step": 662, "step_time": 15.967774812001153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.59375, "completions/mean_terminated_length": 4.851851940155029, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6050771288573742, "epoch": 0.00663, "frac_reward_zero_std": 0.25, "grad_norm": 0.022651631385087967, "kl": 0.6158900130540133, "learning_rate": 7.999854449199155e-06, "loss": -0.0723, "num_tokens": 17459144.0, "reward": 1.4493680000305176, "reward_std": 1.0985954999923706, "rewards/rollout_reward_func/mean": 1.4493680000305176, "rewards/rollout_reward_func/std": 1.0985954999923706, "sampling/importance_sampling_ratio/max": 1.113936185836792, "sampling/importance_sampling_ratio/mean": 0.7732599377632141, "sampling/importance_sampling_ratio/min": 2.3496478718243452e-07, "sampling/sampling_logp_difference/max": 2.3003392219543457, "sampling/sampling_logp_difference/mean": 0.28405681252479553, "step": 663, "step_time": 27.114203538018046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6055704914033413, "epoch": 0.00664, "grad_norm": 0.022096628323197365, "kl": 0.6035455632954836, "learning_rate": 7.99985398455581e-06, "loss": -0.0723, "step": 664, "step_time": 12.204913427995052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.15625, "completions/mean_terminated_length": 5.5714287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.612051323056221, "epoch": 0.00665, "frac_reward_zero_std": 0.0, "grad_norm": 0.13600905239582062, "kl": 0.37212478648871183, "learning_rate": 7.999853519172022e-06, "loss": -0.0719, "num_tokens": 17514799.0, "reward": -0.00029227137565612793, "reward_std": 1.269258737564087, "rewards/rollout_reward_func/mean": -0.00029227137565612793, "rewards/rollout_reward_func/std": 1.269258737564087, "sampling/importance_sampling_ratio/max": 1.2232956886291504, "sampling/importance_sampling_ratio/mean": 0.4579452574253082, "sampling/importance_sampling_ratio/min": 1.1128521038017425e-07, "sampling/sampling_logp_difference/max": 2.265765428543091, "sampling/sampling_logp_difference/mean": 0.44064080715179443, "step": 665, "step_time": 29.254326231006416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009375000139698386, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009375000139698386, "entropy": 2.624884709715843, "epoch": 0.00666, "grad_norm": 0.05141071230173111, "kl": 0.3536948459222913, "learning_rate": 7.999853053047792e-06, "loss": -0.0726, "step": 666, "step_time": 13.735799600981409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.46875, "completions/mean_terminated_length": 4.625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5639122165739536, "epoch": 0.00667, "frac_reward_zero_std": 0.0, "grad_norm": 0.04618940129876137, "kl": 0.2892434038221836, "learning_rate": 7.999852586183117e-06, "loss": -0.0972, "num_tokens": 17573524.0, "reward": 0.8590260744094849, "reward_std": 1.057625651359558, "rewards/rollout_reward_func/mean": 0.8590260744094849, "rewards/rollout_reward_func/std": 1.0576257705688477, "sampling/importance_sampling_ratio/max": 1.3915060758590698, "sampling/importance_sampling_ratio/mean": 0.7676440477371216, "sampling/importance_sampling_ratio/min": 3.493790791253559e-05, "sampling/sampling_logp_difference/max": 2.324225902557373, "sampling/sampling_logp_difference/mean": 0.3339575529098511, "step": 667, "step_time": 33.272612058979576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5627620331943035, "epoch": 0.00668, "grad_norm": 0.04380542412400246, "kl": 0.2860973905771971, "learning_rate": 7.999852118577999e-06, "loss": -0.0972, "step": 668, "step_time": 15.319021624003653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.0625, "completions/mean_terminated_length": 4.956521987915039, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5793946012854576, "epoch": 0.00669, "frac_reward_zero_std": 0.25, "grad_norm": 0.055710263550281525, "kl": 0.3943272475153208, "learning_rate": 7.99985165023244e-06, "loss": -0.0905, "num_tokens": 17634354.0, "reward": 0.7900428771972656, "reward_std": 1.2856645584106445, "rewards/rollout_reward_func/mean": 0.7900428771972656, "rewards/rollout_reward_func/std": 1.2856645584106445, "sampling/importance_sampling_ratio/max": 1.493416666984558, "sampling/importance_sampling_ratio/mean": 0.7030056118965149, "sampling/importance_sampling_ratio/min": 9.127972333544676e-08, "sampling/sampling_logp_difference/max": 1.9991004467010498, "sampling/sampling_logp_difference/mean": 0.2902560234069824, "step": 669, "step_time": 38.02049928402994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5796604165807366, "epoch": 0.0067, "grad_norm": 0.05150231719017029, "kl": 0.39286401960998774, "learning_rate": 7.999851181146436e-06, "loss": -0.0906, "step": 670, "step_time": 15.963207242981298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.5625, "completions/mean_terminated_length": 4.193548202514648, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7044892758131027, "epoch": 0.00671, "frac_reward_zero_std": 0.5, "grad_norm": 0.2738284766674042, "kl": 0.45432937517762184, "learning_rate": 7.999850711319991e-06, "loss": -0.0024, "num_tokens": 17674981.0, "reward": 0.15984520316123962, "reward_std": 1.3044116497039795, "rewards/rollout_reward_func/mean": 0.15984520316123962, "rewards/rollout_reward_func/std": 1.30441153049469, "sampling/importance_sampling_ratio/max": 1.2862625122070312, "sampling/importance_sampling_ratio/mean": 0.9270197153091431, "sampling/importance_sampling_ratio/min": 0.00020561739802360535, "sampling/sampling_logp_difference/max": 1.3984451293945312, "sampling/sampling_logp_difference/mean": 0.1364227533340454, "step": 671, "step_time": 22.12783609201142 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.7308527529239655, "epoch": 0.00672, "grad_norm": 0.24079464375972748, "kl": 0.4619786199182272, "learning_rate": 7.999850240753102e-06, "loss": -0.004, "step": 672, "step_time": 12.104504079004982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.03125, "completions/mean_terminated_length": 4.961538791656494, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8334562592208385, "epoch": 0.00673, "frac_reward_zero_std": 0.0, "grad_norm": 0.13360947370529175, "kl": 0.3791193375363946, "learning_rate": 7.999849769445771e-06, "loss": -0.0843, "num_tokens": 17728509.0, "reward": -0.3216079771518707, "reward_std": 0.8611318469047546, "rewards/rollout_reward_func/mean": -0.3216079771518707, "rewards/rollout_reward_func/std": 0.8611318469047546, "sampling/importance_sampling_ratio/max": 1.4175630807876587, "sampling/importance_sampling_ratio/mean": 0.6470115184783936, "sampling/importance_sampling_ratio/min": 6.219422061803925e-07, "sampling/sampling_logp_difference/max": 2.0492846965789795, "sampling/sampling_logp_difference/mean": 0.341531902551651, "step": 673, "step_time": 28.799520272004884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "entropy": 1.857850968837738, "epoch": 0.00674, "grad_norm": 0.1440771222114563, "kl": 0.375576077029109, "learning_rate": 7.999849297398e-06, "loss": -0.0844, "step": 674, "step_time": 12.609268425032496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.65625, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5614386722445488, "epoch": 0.00675, "frac_reward_zero_std": 0.0, "grad_norm": 0.09325826913118362, "kl": 0.35709081776440144, "learning_rate": 7.999848824609783e-06, "loss": -0.0929, "num_tokens": 17777825.0, "reward": 1.1702810525894165, "reward_std": 1.1577519178390503, "rewards/rollout_reward_func/mean": 1.1702810525894165, "rewards/rollout_reward_func/std": 1.1577517986297607, "sampling/importance_sampling_ratio/max": 1.3096308708190918, "sampling/importance_sampling_ratio/mean": 0.7664015293121338, "sampling/importance_sampling_ratio/min": 6.410811010937323e-07, "sampling/sampling_logp_difference/max": 2.556914806365967, "sampling/sampling_logp_difference/mean": 0.33221450448036194, "step": 675, "step_time": 26.16637769197405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5578899458050728, "epoch": 0.00676, "grad_norm": 0.09494364261627197, "kl": 0.3540405295789242, "learning_rate": 7.999848351081125e-06, "loss": -0.0931, "step": 676, "step_time": 12.56477858600556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.90625, "completions/mean_terminated_length": 5.190476417541504, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5866758413612843, "epoch": 0.00677, "frac_reward_zero_std": 0.0, "grad_norm": 0.07630139589309692, "kl": 0.2094997838139534, "learning_rate": 7.999847876812025e-06, "loss": -0.0793, "num_tokens": 17840092.0, "reward": 0.0835169106721878, "reward_std": 1.1879854202270508, "rewards/rollout_reward_func/mean": 0.0835169106721878, "rewards/rollout_reward_func/std": 1.1879854202270508, "sampling/importance_sampling_ratio/max": 1.2516978979110718, "sampling/importance_sampling_ratio/mean": 0.5611451864242554, "sampling/importance_sampling_ratio/min": 5.680489678638878e-08, "sampling/sampling_logp_difference/max": 2.293269634246826, "sampling/sampling_logp_difference/mean": 0.43802106380462646, "step": 677, "step_time": 34.319850109983236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.585516717284918, "epoch": 0.00678, "grad_norm": 0.06826663762331009, "kl": 0.21687131468206644, "learning_rate": 7.999847401802484e-06, "loss": -0.0794, "step": 678, "step_time": 16.23284902301384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.375, "completions/mean_terminated_length": 5.3913044929504395, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6712626665830612, "epoch": 0.00679, "frac_reward_zero_std": 0.0, "grad_norm": 0.13218097388744354, "kl": 0.3458353839814663, "learning_rate": 7.9998469260525e-06, "loss": -0.0711, "num_tokens": 17901706.0, "reward": 0.6356695294380188, "reward_std": 1.1948175430297852, "rewards/rollout_reward_func/mean": 0.6356695294380188, "rewards/rollout_reward_func/std": 1.1948175430297852, "sampling/importance_sampling_ratio/max": 1.303985357284546, "sampling/importance_sampling_ratio/mean": 0.641461193561554, "sampling/importance_sampling_ratio/min": 9.934425179380924e-06, "sampling/sampling_logp_difference/max": 1.5823358297348022, "sampling/sampling_logp_difference/mean": 0.3009801506996155, "step": 679, "step_time": 35.30121108099411 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.667269418016076, "epoch": 0.0068, "grad_norm": 0.07264822721481323, "kl": 0.3512249831110239, "learning_rate": 7.999846449562074e-06, "loss": -0.0716, "step": 680, "step_time": 15.266090833029011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.53125, "completions/mean_terminated_length": 4.61904764175415, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.6349170953035355, "epoch": 0.00681, "frac_reward_zero_std": 0.0, "grad_norm": 0.08968163281679153, "kl": 0.22538292780518532, "learning_rate": 7.999845972331208e-06, "loss": -0.0784, "num_tokens": 17960031.0, "reward": 0.41835203766822815, "reward_std": 1.2108045816421509, "rewards/rollout_reward_func/mean": 0.41835203766822815, "rewards/rollout_reward_func/std": 1.2108045816421509, "sampling/importance_sampling_ratio/max": 1.4433505535125732, "sampling/importance_sampling_ratio/mean": 0.5608621835708618, "sampling/importance_sampling_ratio/min": 5.049015427971426e-09, "sampling/sampling_logp_difference/max": 2.2151150703430176, "sampling/sampling_logp_difference/mean": 0.42389747500419617, "step": 681, "step_time": 31.523638017984922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.6281674057245255, "epoch": 0.00682, "grad_norm": 0.08670251816511154, "kl": 0.2261448036879301, "learning_rate": 7.9998454943599e-06, "loss": -0.0786, "step": 682, "step_time": 15.17306975297106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.34375, "completions/mean_terminated_length": 4.555555820465088, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0156235368922353, "epoch": 0.00683, "frac_reward_zero_std": 0.0, "grad_norm": 0.181223064661026, "kl": 1.239696817472577, "learning_rate": 7.99984501564815e-06, "loss": -0.0927, "num_tokens": 18009520.0, "reward": 1.3782908916473389, "reward_std": 1.0391947031021118, "rewards/rollout_reward_func/mean": 1.3782908916473389, "rewards/rollout_reward_func/std": 1.0391947031021118, "sampling/importance_sampling_ratio/max": 1.2709630727767944, "sampling/importance_sampling_ratio/mean": 0.8569567799568176, "sampling/importance_sampling_ratio/min": 9.079209917217668e-07, "sampling/sampling_logp_difference/max": 1.9693421125411987, "sampling/sampling_logp_difference/mean": 0.24375280737876892, "step": 683, "step_time": 30.142501207024907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0104866148903966, "epoch": 0.00684, "grad_norm": 0.13993965089321136, "kl": 1.118586715310812, "learning_rate": 7.999844536195958e-06, "loss": -0.0931, "step": 684, "step_time": 14.387333370017586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 5.454545497894287, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.270906925201416, "epoch": 0.00685, "frac_reward_zero_std": 0.0, "grad_norm": 0.04965067282319069, "kl": 0.17136357887648046, "learning_rate": 7.999844056003326e-06, "loss": -0.0661, "num_tokens": 18063902.0, "reward": 0.039190784096717834, "reward_std": 1.1602541208267212, "rewards/rollout_reward_func/mean": 0.039190784096717834, "rewards/rollout_reward_func/std": 1.1602541208267212, "sampling/importance_sampling_ratio/max": 1.2062571048736572, "sampling/importance_sampling_ratio/mean": 0.6024885177612305, "sampling/importance_sampling_ratio/min": 7.263530221734982e-08, "sampling/sampling_logp_difference/max": 2.2127084732055664, "sampling/sampling_logp_difference/mean": 0.413712739944458, "step": 685, "step_time": 29.210016914003063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2808425277471542, "epoch": 0.00686, "grad_norm": 0.0475354827940464, "kl": 0.17098267190158367, "learning_rate": 7.999843575070253e-06, "loss": -0.0662, "step": 686, "step_time": 13.465814031980699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.84375, "completions/mean_terminated_length": 4.550000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.396178387105465, "epoch": 0.00687, "frac_reward_zero_std": 0.25, "grad_norm": 0.061663851141929626, "kl": 0.6759580755606294, "learning_rate": 7.99984309339674e-06, "loss": -0.0472, "num_tokens": 18120085.0, "reward": 0.5426929593086243, "reward_std": 1.325091004371643, "rewards/rollout_reward_func/mean": 0.5426929593086243, "rewards/rollout_reward_func/std": 1.3250908851623535, "sampling/importance_sampling_ratio/max": 1.195857048034668, "sampling/importance_sampling_ratio/mean": 0.5547425746917725, "sampling/importance_sampling_ratio/min": 1.027444938017652e-07, "sampling/sampling_logp_difference/max": 2.142739772796631, "sampling/sampling_logp_difference/mean": 0.45426100492477417, "step": 687, "step_time": 31.764562701020623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.396377105731517, "epoch": 0.00688, "grad_norm": 0.061675652861595154, "kl": 0.6561989453621209, "learning_rate": 7.999842610982785e-06, "loss": -0.0472, "step": 688, "step_time": 14.442695504010771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 4.384615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4297446478158236, "epoch": 0.00689, "frac_reward_zero_std": 0.25, "grad_norm": 0.025829125195741653, "kl": 0.3031492745503783, "learning_rate": 7.99984212782839e-06, "loss": -0.0328, "num_tokens": 18168230.0, "reward": 0.8258727788925171, "reward_std": 1.4239524602890015, "rewards/rollout_reward_func/mean": 0.8258727788925171, "rewards/rollout_reward_func/std": 1.4239524602890015, "sampling/importance_sampling_ratio/max": 1.2027738094329834, "sampling/importance_sampling_ratio/mean": 0.7885351777076721, "sampling/importance_sampling_ratio/min": 8.321050700033084e-06, "sampling/sampling_logp_difference/max": 1.9171483516693115, "sampling/sampling_logp_difference/mean": 0.2696741223335266, "step": 689, "step_time": 23.748477106986684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4302637353539467, "epoch": 0.0069, "grad_norm": 0.026723133400082588, "kl": 0.2999172993004322, "learning_rate": 7.999841643933555e-06, "loss": -0.0327, "step": 690, "step_time": 11.709235263973824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.90625, "completions/mean_terminated_length": 6.94444465637207, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.991554319858551, "epoch": 0.00691, "frac_reward_zero_std": 0.0, "grad_norm": 0.13869577646255493, "kl": 0.5293432474136353, "learning_rate": 7.99984115929828e-06, "loss": -0.0597, "num_tokens": 18228106.0, "reward": 0.16318698227405548, "reward_std": 1.2528486251831055, "rewards/rollout_reward_func/mean": 0.16318698227405548, "rewards/rollout_reward_func/std": 1.2528486251831055, "sampling/importance_sampling_ratio/max": 1.170549750328064, "sampling/importance_sampling_ratio/mean": 0.2645339369773865, "sampling/importance_sampling_ratio/min": 1.3351698271435453e-06, "sampling/sampling_logp_difference/max": 2.268315315246582, "sampling/sampling_logp_difference/mean": 0.4685291647911072, "step": 691, "step_time": 33.40081276901765 }, { "clip_ratio/high_max": 0.0280303037725389, "clip_ratio/high_mean": 0.016732543474063277, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016732543474063277, "entropy": 2.9968014657497406, "epoch": 0.00692, "grad_norm": 0.10041728615760803, "kl": 0.47159106750041246, "learning_rate": 7.999840673922561e-06, "loss": -0.06, "step": 692, "step_time": 14.54405911701906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.84375, "completions/mean_terminated_length": 4.65217399597168, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.742619164288044, "epoch": 0.00693, "frac_reward_zero_std": 0.25, "grad_norm": 0.09644969552755356, "kl": 0.22383402567356825, "learning_rate": 7.999840187806405e-06, "loss": -0.0765, "num_tokens": 18278541.0, "reward": 0.4999968409538269, "reward_std": 1.3106366395950317, "rewards/rollout_reward_func/mean": 0.4999968409538269, "rewards/rollout_reward_func/std": 1.3106366395950317, "sampling/importance_sampling_ratio/max": 1.1666936874389648, "sampling/importance_sampling_ratio/mean": 0.671108603477478, "sampling/importance_sampling_ratio/min": 3.6392666515894234e-06, "sampling/sampling_logp_difference/max": 2.0325570106506348, "sampling/sampling_logp_difference/mean": 0.29217198491096497, "step": 693, "step_time": 29.698656836000737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7416581101715565, "epoch": 0.00694, "grad_norm": 0.10117317736148834, "kl": 0.22428411711007357, "learning_rate": 7.999839700949809e-06, "loss": -0.0767, "step": 694, "step_time": 12.343486309982836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.40625, "completions/mean_terminated_length": 5.952381134033203, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4402382522821426, "epoch": 0.00695, "frac_reward_zero_std": 0.0, "grad_norm": 0.12102081626653671, "kl": 0.2613533865660429, "learning_rate": 7.999839213352772e-06, "loss": -0.0796, "num_tokens": 18338984.0, "reward": 0.23858611285686493, "reward_std": 1.2481145858764648, "rewards/rollout_reward_func/mean": 0.23858611285686493, "rewards/rollout_reward_func/std": 1.2481145858764648, "sampling/importance_sampling_ratio/max": 1.346107006072998, "sampling/importance_sampling_ratio/mean": 0.5512927174568176, "sampling/importance_sampling_ratio/min": 1.8702813804338803e-06, "sampling/sampling_logp_difference/max": 2.0976240634918213, "sampling/sampling_logp_difference/mean": 0.43550050258636475, "step": 695, "step_time": 30.265909591020318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.43429134786129, "epoch": 0.00696, "grad_norm": 0.1218322142958641, "kl": 0.2713182084262371, "learning_rate": 7.999838725015296e-06, "loss": -0.0797, "step": 696, "step_time": 13.579559570003767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.34375, "completions/mean_terminated_length": 5.34615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.90840582549572, "epoch": 0.00697, "frac_reward_zero_std": 0.0, "grad_norm": 0.14441128075122833, "kl": 0.7093100873753428, "learning_rate": 7.99983823593738e-06, "loss": -0.0639, "num_tokens": 18397008.0, "reward": -0.34885865449905396, "reward_std": 1.0505441427230835, "rewards/rollout_reward_func/mean": -0.34885865449905396, "rewards/rollout_reward_func/std": 1.0505441427230835, "sampling/importance_sampling_ratio/max": 1.334766149520874, "sampling/importance_sampling_ratio/mean": 0.6308809518814087, "sampling/importance_sampling_ratio/min": 3.2951550110738026e-06, "sampling/sampling_logp_difference/max": 2.5955328941345215, "sampling/sampling_logp_difference/mean": 0.364055871963501, "step": 697, "step_time": 32.81324152498564 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.9029325991868973, "epoch": 0.00698, "grad_norm": 0.17393764853477478, "kl": 0.6196648683398962, "learning_rate": 7.999837746119025e-06, "loss": -0.065, "step": 698, "step_time": 14.174759636021918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.518518447875977, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4966228250414133, "epoch": 0.00699, "frac_reward_zero_std": 0.0, "grad_norm": 0.4438904821872711, "kl": 1.8005859199911356, "learning_rate": 7.99983725556023e-06, "loss": -0.025, "num_tokens": 18451078.0, "reward": 0.3285694420337677, "reward_std": 1.3331395387649536, "rewards/rollout_reward_func/mean": 0.3285694420337677, "rewards/rollout_reward_func/std": 1.3331396579742432, "sampling/importance_sampling_ratio/max": 1.500417947769165, "sampling/importance_sampling_ratio/mean": 0.7768444418907166, "sampling/importance_sampling_ratio/min": 3.721531584233162e-06, "sampling/sampling_logp_difference/max": 1.8461861610412598, "sampling/sampling_logp_difference/mean": 0.3081498146057129, "step": 699, "step_time": 36.77585310502036 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.014226973755285144, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019908792106434703, "entropy": 1.5586056504398584, "epoch": 0.007, "grad_norm": 0.2018701285123825, "kl": 1.612832985818386, "learning_rate": 7.999836764260995e-06, "loss": -0.0281, "step": 700, "step_time": 19.160214008996263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 4.814815044403076, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4424433652311563, "epoch": 0.00701, "frac_reward_zero_std": 0.25, "grad_norm": 0.11808998882770538, "kl": 0.24648798443377018, "learning_rate": 7.999836272221323e-06, "loss": -0.0866, "num_tokens": 18503690.0, "reward": 1.3846426010131836, "reward_std": 1.020728349685669, "rewards/rollout_reward_func/mean": 1.3846426010131836, "rewards/rollout_reward_func/std": 1.020728349685669, "sampling/importance_sampling_ratio/max": 1.529990315437317, "sampling/importance_sampling_ratio/mean": 0.8319835662841797, "sampling/importance_sampling_ratio/min": 1.3128194041200913e-05, "sampling/sampling_logp_difference/max": 1.940659761428833, "sampling/sampling_logp_difference/mean": 0.2802959382534027, "step": 701, "step_time": 29.63014090897923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4812288098037243, "epoch": 0.00702, "grad_norm": 0.14294029772281647, "kl": 0.239945650100708, "learning_rate": 7.99983577944121e-06, "loss": -0.0857, "step": 702, "step_time": 14.943314111995278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.875, "completions/mean_terminated_length": 5.599999904632568, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.111725091934204, "epoch": 0.00703, "frac_reward_zero_std": 0.0, "grad_norm": 0.06986527889966965, "kl": 0.3216505143791437, "learning_rate": 7.999835285920659e-06, "loss": -0.0644, "num_tokens": 18556930.0, "reward": 0.6486001014709473, "reward_std": 1.3003491163253784, "rewards/rollout_reward_func/mean": 0.6486001014709473, "rewards/rollout_reward_func/std": 1.3003491163253784, "sampling/importance_sampling_ratio/max": 1.270921230316162, "sampling/importance_sampling_ratio/mean": 0.6013314723968506, "sampling/importance_sampling_ratio/min": 2.5657923288235907e-06, "sampling/sampling_logp_difference/max": 2.2990798950195312, "sampling/sampling_logp_difference/mean": 0.38463687896728516, "step": 703, "step_time": 30.513512229983462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1148136258125305, "epoch": 0.00704, "grad_norm": 0.0718684121966362, "kl": 0.31837565265595913, "learning_rate": 7.999834791659668e-06, "loss": -0.0643, "step": 704, "step_time": 13.71649127699493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.09375, "completions/mean_terminated_length": 4.259259223937988, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3742950558662415, "epoch": 0.00705, "frac_reward_zero_std": 0.0, "grad_norm": 0.14125144481658936, "kl": 0.23316055536270142, "learning_rate": 7.99983429665824e-06, "loss": -0.0843, "num_tokens": 18597752.0, "reward": 0.7082647085189819, "reward_std": 1.2363892793655396, "rewards/rollout_reward_func/mean": 0.7082647085189819, "rewards/rollout_reward_func/std": 1.2363892793655396, "sampling/importance_sampling_ratio/max": 1.1784532070159912, "sampling/importance_sampling_ratio/mean": 0.8505103588104248, "sampling/importance_sampling_ratio/min": 6.370203209371539e-07, "sampling/sampling_logp_difference/max": 2.011737108230591, "sampling/sampling_logp_difference/mean": 0.33201396465301514, "step": 705, "step_time": 23.855746800021734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.369351614266634, "epoch": 0.00706, "grad_norm": 0.1401980072259903, "kl": 0.23384618666023016, "learning_rate": 7.999833800916372e-06, "loss": -0.0844, "step": 706, "step_time": 12.350920256008976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.0625, "completions/mean_terminated_length": 5.839999675750732, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.874149203300476, "epoch": 0.00707, "frac_reward_zero_std": 0.0, "grad_norm": 0.06796678155660629, "kl": 0.186422286555171, "learning_rate": 7.999833304434066e-06, "loss": -0.0312, "num_tokens": 18642369.0, "reward": 0.1963234692811966, "reward_std": 1.4172605276107788, "rewards/rollout_reward_func/mean": 0.1963234692811966, "rewards/rollout_reward_func/std": 1.4172605276107788, "sampling/importance_sampling_ratio/max": 1.178633213043213, "sampling/importance_sampling_ratio/mean": 0.6364352107048035, "sampling/importance_sampling_ratio/min": 1.3418747585092206e-05, "sampling/sampling_logp_difference/max": 2.0882105827331543, "sampling/sampling_logp_difference/mean": 0.29921361804008484, "step": 707, "step_time": 26.516916055988986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8676570858806372, "epoch": 0.00708, "grad_norm": 0.07273492962121964, "kl": 0.1899759960360825, "learning_rate": 7.999832807211321e-06, "loss": -0.031, "step": 708, "step_time": 11.701246138021816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.5625, "completions/mean_terminated_length": 5.615385055541992, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8460537865757942, "epoch": 0.00709, "frac_reward_zero_std": 0.0, "grad_norm": 0.19216008484363556, "kl": 0.28033375460654497, "learning_rate": 7.99983230924814e-06, "loss": -0.0839, "num_tokens": 18694585.0, "reward": 0.028245538473129272, "reward_std": 1.1498222351074219, "rewards/rollout_reward_func/mean": 0.028245538473129272, "rewards/rollout_reward_func/std": 1.1498222351074219, "sampling/importance_sampling_ratio/max": 1.3240563869476318, "sampling/importance_sampling_ratio/mean": 0.6347771883010864, "sampling/importance_sampling_ratio/min": 2.992681402247399e-05, "sampling/sampling_logp_difference/max": 1.9116053581237793, "sampling/sampling_logp_difference/mean": 0.30793362855911255, "step": 709, "step_time": 28.53245126098045 }, { "clip_ratio/high_max": 0.00657894741743803, "clip_ratio/high_mean": 0.003289473708719015, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003289473708719015, "entropy": 1.8267197012901306, "epoch": 0.0071, "grad_norm": 0.15065909922122955, "kl": 0.28552178479731083, "learning_rate": 7.999831810544519e-06, "loss": -0.0849, "step": 710, "step_time": 13.708216963044833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.28125, "completions/mean_terminated_length": 7.117647171020508, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.1406287103891373, "epoch": 0.00711, "frac_reward_zero_std": 0.0, "grad_norm": 0.09388461709022522, "kl": 0.18533609388396144, "learning_rate": 7.99983131110046e-06, "loss": -0.0843, "num_tokens": 18761355.0, "reward": -0.17023681104183197, "reward_std": 1.1667367219924927, "rewards/rollout_reward_func/mean": -0.17023681104183197, "rewards/rollout_reward_func/std": 1.1667367219924927, "sampling/importance_sampling_ratio/max": 1.3709690570831299, "sampling/importance_sampling_ratio/mean": 0.3150596618652344, "sampling/importance_sampling_ratio/min": 7.946736246822184e-08, "sampling/sampling_logp_difference/max": 2.145303726196289, "sampling/sampling_logp_difference/mean": 0.5123950242996216, "step": 711, "step_time": 39.843413967988454 }, { "clip_ratio/high_max": 0.0059523810632526875, "clip_ratio/high_mean": 0.0029761905316263437, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029761905316263437, "entropy": 3.119664877653122, "epoch": 0.00712, "grad_norm": 0.08199343085289001, "kl": 0.17725031077861786, "learning_rate": 7.999830810915965e-06, "loss": -0.0848, "step": 712, "step_time": 18.896606835987768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.1875, "completions/mean_terminated_length": 4.370370388031006, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1593983052298427, "epoch": 0.00713, "frac_reward_zero_std": 0.0, "grad_norm": 0.06741143763065338, "kl": 0.3878532024100423, "learning_rate": 7.999830309991031e-06, "loss": -0.0862, "num_tokens": 18807760.0, "reward": 1.2038357257843018, "reward_std": 1.2148981094360352, "rewards/rollout_reward_func/mean": 1.2038357257843018, "rewards/rollout_reward_func/std": 1.2148981094360352, "sampling/importance_sampling_ratio/max": 1.2672289609909058, "sampling/importance_sampling_ratio/mean": 0.8288447856903076, "sampling/importance_sampling_ratio/min": 2.5425870262552053e-05, "sampling/sampling_logp_difference/max": 2.290285587310791, "sampling/sampling_logp_difference/mean": 0.24803629517555237, "step": 713, "step_time": 23.872261034004623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1506635462865233, "epoch": 0.00714, "grad_norm": 0.058513984084129333, "kl": 0.41291956324130297, "learning_rate": 7.99982980832566e-06, "loss": -0.0866, "step": 714, "step_time": 12.173839336988749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.4375, "completions/mean_terminated_length": 4.947368621826172, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3499869629740715, "epoch": 0.00715, "frac_reward_zero_std": 0.0, "grad_norm": 0.10818594694137573, "kl": 0.5857682321220636, "learning_rate": 7.999829305919849e-06, "loss": -0.0579, "num_tokens": 18866290.0, "reward": -0.11941967904567719, "reward_std": 1.228766918182373, "rewards/rollout_reward_func/mean": -0.11941967904567719, "rewards/rollout_reward_func/std": 1.2287667989730835, "sampling/importance_sampling_ratio/max": 1.389941692352295, "sampling/importance_sampling_ratio/mean": 0.5010366439819336, "sampling/importance_sampling_ratio/min": 2.701681523831212e-06, "sampling/sampling_logp_difference/max": 2.724846363067627, "sampling/sampling_logp_difference/mean": 0.4397875666618347, "step": 715, "step_time": 31.62022402897128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.333559574559331, "epoch": 0.00716, "grad_norm": 0.09967156499624252, "kl": 0.528564459644258, "learning_rate": 7.999828802773603e-06, "loss": -0.0581, "step": 716, "step_time": 13.757294450013433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 5.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6826229970902205, "epoch": 0.00717, "frac_reward_zero_std": 0.25, "grad_norm": 0.6130708456039429, "kl": 0.2718375762924552, "learning_rate": 7.99982829888692e-06, "loss": -0.0576, "num_tokens": 18920511.0, "reward": 0.46214064955711365, "reward_std": 1.2092294692993164, "rewards/rollout_reward_func/mean": 0.46214064955711365, "rewards/rollout_reward_func/std": 1.2092293500900269, "sampling/importance_sampling_ratio/max": 1.422136902809143, "sampling/importance_sampling_ratio/mean": 0.7935926914215088, "sampling/importance_sampling_ratio/min": 1.9750020783249056e-06, "sampling/sampling_logp_difference/max": 1.9762097597122192, "sampling/sampling_logp_difference/mean": 0.3119150400161743, "step": 717, "step_time": 31.84252565598581 }, { "clip_ratio/high_max": 0.017613636795431376, "clip_ratio/high_mean": 0.008806818397715688, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015056818490847945, "entropy": 1.6542950700968504, "epoch": 0.00718, "grad_norm": 0.10015282034873962, "kl": 0.2718841889873147, "learning_rate": 7.9998277942598e-06, "loss": -0.0596, "step": 718, "step_time": 15.000502192022395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.65625, "completions/mean_terminated_length": 4.3913044929504395, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0386974224820733, "epoch": 0.00719, "frac_reward_zero_std": 0.0, "grad_norm": 0.031481705605983734, "kl": 0.20303408429026604, "learning_rate": 7.99982728889224e-06, "loss": -0.087, "num_tokens": 18973637.0, "reward": 0.6680791974067688, "reward_std": 1.3878172636032104, "rewards/rollout_reward_func/mean": 0.6680791974067688, "rewards/rollout_reward_func/std": 1.3878173828125, "sampling/importance_sampling_ratio/max": 1.341963291168213, "sampling/importance_sampling_ratio/mean": 0.762903094291687, "sampling/importance_sampling_ratio/min": 7.450388039842437e-08, "sampling/sampling_logp_difference/max": 2.1448237895965576, "sampling/sampling_logp_difference/mean": 0.4287559390068054, "step": 719, "step_time": 29.465748554997845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.030035282485187, "epoch": 0.0072, "grad_norm": 0.021605461835861206, "kl": 0.20311388885602355, "learning_rate": 7.999826782784247e-06, "loss": -0.0871, "step": 720, "step_time": 13.81636755602085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 4.769230842590332, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2720885928720236, "epoch": 0.00721, "frac_reward_zero_std": 0.0, "grad_norm": 0.060691237449645996, "kl": 0.28485568054020405, "learning_rate": 7.999826275935814e-06, "loss": -0.06, "num_tokens": 19025445.0, "reward": 0.060136668384075165, "reward_std": 1.3600972890853882, "rewards/rollout_reward_func/mean": 0.060136668384075165, "rewards/rollout_reward_func/std": 1.3600972890853882, "sampling/importance_sampling_ratio/max": 1.239727258682251, "sampling/importance_sampling_ratio/mean": 0.7866131067276001, "sampling/importance_sampling_ratio/min": 2.4403272618656047e-05, "sampling/sampling_logp_difference/max": 1.737372875213623, "sampling/sampling_logp_difference/mean": 0.28349876403808594, "step": 721, "step_time": 26.303957789990818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2675058636814356, "epoch": 0.00722, "grad_norm": 0.05837105214595795, "kl": 0.294960368424654, "learning_rate": 7.999825768346947e-06, "loss": -0.0599, "step": 722, "step_time": 14.25503227201989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.78125, "completions/mean_terminated_length": 5.884615421295166, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.059856154024601, "epoch": 0.00723, "frac_reward_zero_std": 0.0, "grad_norm": 0.07965004444122314, "kl": 0.3686538580805063, "learning_rate": 7.999825260017642e-06, "loss": -0.049, "num_tokens": 19084317.0, "reward": -0.05741661787033081, "reward_std": 1.0609748363494873, "rewards/rollout_reward_func/mean": -0.05741661787033081, "rewards/rollout_reward_func/std": 1.0609748363494873, "sampling/importance_sampling_ratio/max": 1.4869745969772339, "sampling/importance_sampling_ratio/mean": 0.6495122909545898, "sampling/importance_sampling_ratio/min": 3.4959568573356137e-09, "sampling/sampling_logp_difference/max": 2.5898139476776123, "sampling/sampling_logp_difference/mean": 0.43461179733276367, "step": 723, "step_time": 33.06588975399791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.052191149443388, "epoch": 0.00724, "grad_norm": 0.08032410591840744, "kl": 0.35683669056743383, "learning_rate": 7.999824750947901e-06, "loss": -0.0489, "step": 724, "step_time": 15.670636729002581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.15625, "completions/mean_terminated_length": 4.679999828338623, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6332378461956978, "epoch": 0.00725, "frac_reward_zero_std": 0.0, "grad_norm": 0.16249917447566986, "kl": 0.6702070087194443, "learning_rate": 7.999824241137725e-06, "loss": -0.059, "num_tokens": 19136645.0, "reward": 0.7364047169685364, "reward_std": 1.3889684677124023, "rewards/rollout_reward_func/mean": 0.7364047169685364, "rewards/rollout_reward_func/std": 1.3889684677124023, "sampling/importance_sampling_ratio/max": 1.9961885213851929, "sampling/importance_sampling_ratio/mean": 0.7504544258117676, "sampling/importance_sampling_ratio/min": 4.401146895816055e-07, "sampling/sampling_logp_difference/max": 1.8639603853225708, "sampling/sampling_logp_difference/mean": 0.3403165936470032, "step": 725, "step_time": 29.194490669004153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6372350524179637, "epoch": 0.00726, "grad_norm": 0.15829920768737793, "kl": 0.6704529132694006, "learning_rate": 7.999823730587112e-06, "loss": -0.0591, "step": 726, "step_time": 15.857060829992406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.71875, "completions/mean_terminated_length": 4.958333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3052786886692047, "epoch": 0.00727, "frac_reward_zero_std": 0.0, "grad_norm": 0.06226386874914169, "kl": 0.5125619079917669, "learning_rate": 7.999823219296063e-06, "loss": -0.0635, "num_tokens": 19192041.0, "reward": 0.970501184463501, "reward_std": 1.257084846496582, "rewards/rollout_reward_func/mean": 0.970501184463501, "rewards/rollout_reward_func/std": 1.257084846496582, "sampling/importance_sampling_ratio/max": 1.120801568031311, "sampling/importance_sampling_ratio/mean": 0.6371317505836487, "sampling/importance_sampling_ratio/min": 4.343364139458572e-07, "sampling/sampling_logp_difference/max": 1.8575254678726196, "sampling/sampling_logp_difference/mean": 0.2224467396736145, "step": 727, "step_time": 40.326880068998435 }, { "clip_ratio/high_max": 0.01657197019085288, "clip_ratio/high_mean": 0.00828598509542644, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00828598509542644, "entropy": 1.311161708086729, "epoch": 0.00728, "grad_norm": 0.06806939095258713, "kl": 0.4619088377803564, "learning_rate": 7.999822707264577e-06, "loss": -0.0636, "step": 728, "step_time": 18.292642529006116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 4.2727274894714355, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1370134502649307, "epoch": 0.00729, "frac_reward_zero_std": 0.0, "grad_norm": 0.10865052789449692, "kl": 0.3190316688269377, "learning_rate": 7.999822194492657e-06, "loss": -0.0945, "num_tokens": 19248777.0, "reward": 0.21521137654781342, "reward_std": 1.1624102592468262, "rewards/rollout_reward_func/mean": 0.21521137654781342, "rewards/rollout_reward_func/std": 1.1624101400375366, "sampling/importance_sampling_ratio/max": 1.3665860891342163, "sampling/importance_sampling_ratio/mean": 0.6683429479598999, "sampling/importance_sampling_ratio/min": 2.2251970221986994e-06, "sampling/sampling_logp_difference/max": 1.7503962516784668, "sampling/sampling_logp_difference/mean": 0.3704124391078949, "step": 729, "step_time": 30.204017683980055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.146403893828392, "epoch": 0.0073, "grad_norm": 0.10298073291778564, "kl": 0.3223718162626028, "learning_rate": 7.999821680980302e-06, "loss": -0.0945, "step": 730, "step_time": 13.564102244010428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 4.461538791656494, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1901442548260093, "epoch": 0.00731, "frac_reward_zero_std": 0.25, "grad_norm": 0.05526804178953171, "kl": 0.24442230351269245, "learning_rate": 7.999821166727509e-06, "loss": -0.0597, "num_tokens": 19299637.0, "reward": 0.9586682319641113, "reward_std": 1.1801453828811646, "rewards/rollout_reward_func/mean": 0.9586682319641113, "rewards/rollout_reward_func/std": 1.1801453828811646, "sampling/importance_sampling_ratio/max": 1.3858586549758911, "sampling/importance_sampling_ratio/mean": 0.8024545907974243, "sampling/importance_sampling_ratio/min": 5.1395844025137194e-08, "sampling/sampling_logp_difference/max": 2.1433582305908203, "sampling/sampling_logp_difference/mean": 0.28711116313934326, "step": 731, "step_time": 26.4550699449901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1935209594666958, "epoch": 0.00732, "grad_norm": 0.05699488893151283, "kl": 0.24843459948897362, "learning_rate": 7.999820651734282e-06, "loss": -0.0597, "step": 732, "step_time": 12.981443060009042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.21875, "completions/mean_terminated_length": 4.8214287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0406939014792442, "epoch": 0.00733, "frac_reward_zero_std": 0.25, "grad_norm": 0.03609244152903557, "kl": 0.38530114479362965, "learning_rate": 7.99982013600062e-06, "loss": -0.0613, "num_tokens": 19350524.0, "reward": 0.7509827017784119, "reward_std": 1.2021442651748657, "rewards/rollout_reward_func/mean": 0.7509827017784119, "rewards/rollout_reward_func/std": 1.2021441459655762, "sampling/importance_sampling_ratio/max": 1.4316041469573975, "sampling/importance_sampling_ratio/mean": 0.8476299047470093, "sampling/importance_sampling_ratio/min": 1.2988442904315889e-05, "sampling/sampling_logp_difference/max": 1.860129714012146, "sampling/sampling_logp_difference/mean": 0.23137962818145752, "step": 733, "step_time": 27.16123034901102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0425691865384579, "epoch": 0.00734, "grad_norm": 0.036025840789079666, "kl": 0.38973426446318626, "learning_rate": 7.999819619526523e-06, "loss": -0.0613, "step": 734, "step_time": 12.373224976006895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.8125, "completions/mean_terminated_length": 4.6086955070495605, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.829863621853292, "epoch": 0.00735, "frac_reward_zero_std": 0.0, "grad_norm": 0.1514539271593094, "kl": 0.30826508067548275, "learning_rate": 7.999819102311991e-06, "loss": -0.0649, "num_tokens": 19413367.0, "reward": 0.25321483612060547, "reward_std": 1.309950351715088, "rewards/rollout_reward_func/mean": 0.25321483612060547, "rewards/rollout_reward_func/std": 1.309950351715088, "sampling/importance_sampling_ratio/max": 1.5012062788009644, "sampling/importance_sampling_ratio/mean": 0.6910206079483032, "sampling/importance_sampling_ratio/min": 7.549615020252531e-06, "sampling/sampling_logp_difference/max": 2.1615593433380127, "sampling/sampling_logp_difference/mean": 0.36920469999313354, "step": 735, "step_time": 37.48081586501212 }, { "clip_ratio/high_max": 0.0059523810632526875, "clip_ratio/high_mean": 0.0029761905316263437, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029761905316263437, "entropy": 1.8464602436870337, "epoch": 0.00736, "grad_norm": 0.13407792150974274, "kl": 0.28318879194557667, "learning_rate": 7.999818584357024e-06, "loss": -0.0656, "step": 736, "step_time": 18.806628584017744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.96875, "completions/mean_terminated_length": 5.71999979019165, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8120180116966367, "epoch": 0.00737, "frac_reward_zero_std": 0.0, "grad_norm": 0.18142762780189514, "kl": 0.3034185813739896, "learning_rate": 7.999818065661623e-06, "loss": -0.0832, "num_tokens": 19474897.0, "reward": 0.5365925431251526, "reward_std": 1.29862380027771, "rewards/rollout_reward_func/mean": 0.5365925431251526, "rewards/rollout_reward_func/std": 1.29862380027771, "sampling/importance_sampling_ratio/max": 1.852797508239746, "sampling/importance_sampling_ratio/mean": 0.760027289390564, "sampling/importance_sampling_ratio/min": 1.4008910511620343e-06, "sampling/sampling_logp_difference/max": 2.7551565170288086, "sampling/sampling_logp_difference/mean": 0.35509687662124634, "step": 737, "step_time": 31.50357706102659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8163820449262857, "epoch": 0.00738, "grad_norm": 0.18394778668880463, "kl": 0.3014743449166417, "learning_rate": 7.999817546225787e-06, "loss": -0.0835, "step": 738, "step_time": 13.783062258022255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.4375, "completions/mean_terminated_length": 5.4782609939575195, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0563800036907196, "epoch": 0.00739, "frac_reward_zero_std": 0.25, "grad_norm": 0.061927974224090576, "kl": 0.4477659519761801, "learning_rate": 7.999817026049515e-06, "loss": -0.0499, "num_tokens": 19532870.0, "reward": 0.8769447803497314, "reward_std": 1.2228113412857056, "rewards/rollout_reward_func/mean": 0.8769447803497314, "rewards/rollout_reward_func/std": 1.2228113412857056, "sampling/importance_sampling_ratio/max": 1.2974456548690796, "sampling/importance_sampling_ratio/mean": 0.6219838857650757, "sampling/importance_sampling_ratio/min": 5.225006134423893e-07, "sampling/sampling_logp_difference/max": 2.5615155696868896, "sampling/sampling_logp_difference/mean": 0.3989697992801666, "step": 739, "step_time": 33.787076314969454 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.004166666883975267, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "entropy": 2.0568784810602665, "epoch": 0.0074, "grad_norm": 0.056841347366571426, "kl": 0.42610039189457893, "learning_rate": 7.99981650513281e-06, "loss": -0.0499, "step": 740, "step_time": 14.287121907997061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.53125, "completions/mean_terminated_length": 6.277777671813965, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5432837456464767, "epoch": 0.00741, "frac_reward_zero_std": 0.0, "grad_norm": 0.05662758648395538, "kl": 0.2063814178109169, "learning_rate": 7.999815983475671e-06, "loss": -0.1022, "num_tokens": 19599199.0, "reward": 0.10463027656078339, "reward_std": 1.2161940336227417, "rewards/rollout_reward_func/mean": 0.10463027656078339, "rewards/rollout_reward_func/std": 1.2161940336227417, "sampling/importance_sampling_ratio/max": 1.3364921808242798, "sampling/importance_sampling_ratio/mean": 0.4634537696838379, "sampling/importance_sampling_ratio/min": 1.6646552580823482e-07, "sampling/sampling_logp_difference/max": 2.2758944034576416, "sampling/sampling_logp_difference/mean": 0.45115339756011963, "step": 741, "step_time": 36.23295858998608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.551026403903961, "epoch": 0.00742, "grad_norm": 0.057029686868190765, "kl": 0.2063273098319769, "learning_rate": 7.999815461078097e-06, "loss": -0.1023, "step": 742, "step_time": 16.415702514976147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.34375, "completions/mean_terminated_length": 5.34782600402832, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.223296634852886, "epoch": 0.00743, "frac_reward_zero_std": 0.25, "grad_norm": 0.16271567344665527, "kl": 0.6305246511474252, "learning_rate": 7.999814937940092e-06, "loss": -0.071, "num_tokens": 19652718.0, "reward": 0.628913402557373, "reward_std": 1.3777379989624023, "rewards/rollout_reward_func/mean": 0.628913402557373, "rewards/rollout_reward_func/std": 1.3777379989624023, "sampling/importance_sampling_ratio/max": 1.6116464138031006, "sampling/importance_sampling_ratio/mean": 0.684368371963501, "sampling/importance_sampling_ratio/min": 4.898426997357319e-09, "sampling/sampling_logp_difference/max": 2.036705732345581, "sampling/sampling_logp_difference/mean": 0.398326575756073, "step": 743, "step_time": 28.71152134699514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2258921563625336, "epoch": 0.00744, "grad_norm": 0.16175967454910278, "kl": 0.6321671037003398, "learning_rate": 7.99981441406165e-06, "loss": -0.0715, "step": 744, "step_time": 13.52155732200481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.0625, "completions/mean_terminated_length": 5.909090995788574, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.458941176533699, "epoch": 0.00745, "frac_reward_zero_std": 0.0, "grad_norm": 0.1366049349308014, "kl": 0.1371338851749897, "learning_rate": 7.999813889442775e-06, "loss": -0.1221, "num_tokens": 19710036.0, "reward": 0.5077036023139954, "reward_std": 1.342676043510437, "rewards/rollout_reward_func/mean": 0.5077036023139954, "rewards/rollout_reward_func/std": 1.3426759243011475, "sampling/importance_sampling_ratio/max": 1.4192668199539185, "sampling/importance_sampling_ratio/mean": 0.6729488372802734, "sampling/importance_sampling_ratio/min": 4.786280882740357e-08, "sampling/sampling_logp_difference/max": 2.489642381668091, "sampling/sampling_logp_difference/mean": 0.4140784740447998, "step": 745, "step_time": 34.969614421977894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4612516313791275, "epoch": 0.00746, "grad_norm": 0.10977066308259964, "kl": 0.13860783260315657, "learning_rate": 7.999813364083468e-06, "loss": -0.123, "step": 746, "step_time": 15.121106571008568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.65625, "completions/mean_terminated_length": 4.290322303771973, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.36478533782064915, "epoch": 0.00747, "frac_reward_zero_std": 0.25, "grad_norm": 0.04537973552942276, "kl": 0.7176229301840067, "learning_rate": 7.999812837983725e-06, "loss": -0.0492, "num_tokens": 19745522.0, "reward": 1.9456766843795776, "reward_std": 0.08628658950328827, "rewards/rollout_reward_func/mean": 1.9456766843795776, "rewards/rollout_reward_func/std": 0.08628657460212708, "sampling/importance_sampling_ratio/max": 1.1762365102767944, "sampling/importance_sampling_ratio/mean": 0.9773753881454468, "sampling/importance_sampling_ratio/min": 1.8336477296543308e-05, "sampling/sampling_logp_difference/max": 1.6779839992523193, "sampling/sampling_logp_difference/mean": 0.11811913549900055, "step": 747, "step_time": 12.430949346991838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 0.36345590837299824, "epoch": 0.00748, "grad_norm": 0.030264222994446754, "kl": 0.8407746069133282, "learning_rate": 7.99981231114355e-06, "loss": -0.0493, "step": 748, "step_time": 6.7449603989953175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.53125, "completions/mean_terminated_length": 5.608695983886719, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.5536510944366455, "epoch": 0.00749, "frac_reward_zero_std": 0.0, "grad_norm": 0.12056390196084976, "kl": 0.9872182253748178, "learning_rate": 7.999811783562942e-06, "loss": -0.1012, "num_tokens": 19811281.0, "reward": 0.504772424697876, "reward_std": 1.2637864351272583, "rewards/rollout_reward_func/mean": 0.504772424697876, "rewards/rollout_reward_func/std": 1.2637864351272583, "sampling/importance_sampling_ratio/max": 1.4043024778366089, "sampling/importance_sampling_ratio/mean": 0.4821319878101349, "sampling/importance_sampling_ratio/min": 3.474134791758843e-05, "sampling/sampling_logp_difference/max": 1.7683744430541992, "sampling/sampling_logp_difference/mean": 0.40705519914627075, "step": 749, "step_time": 32.68513988799532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5589689388871193, "epoch": 0.0075, "grad_norm": 0.12854792177677155, "kl": 1.0211855471134186, "learning_rate": 7.9998112552419e-06, "loss": -0.1012, "step": 750, "step_time": 14.177961144014262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.03125, "completions/mean_terminated_length": 4.185185432434082, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9937938149087131, "epoch": 0.00751, "frac_reward_zero_std": 0.0, "grad_norm": 0.1055150255560875, "kl": 0.6984025379642844, "learning_rate": 7.999810726180428e-06, "loss": -0.0674, "num_tokens": 19859790.0, "reward": 0.7106163501739502, "reward_std": 1.272475242614746, "rewards/rollout_reward_func/mean": 0.7106163501739502, "rewards/rollout_reward_func/std": 1.2724751234054565, "sampling/importance_sampling_ratio/max": 1.4564851522445679, "sampling/importance_sampling_ratio/mean": 0.861685574054718, "sampling/importance_sampling_ratio/min": 7.307839496206725e-06, "sampling/sampling_logp_difference/max": 1.875012755393982, "sampling/sampling_logp_difference/mean": 0.23231816291809082, "step": 751, "step_time": 28.042334290003055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9941504541784525, "epoch": 0.00752, "grad_norm": 0.10313276946544647, "kl": 0.6863156110048294, "learning_rate": 7.99981019637852e-06, "loss": -0.0675, "step": 752, "step_time": 13.573522215985577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.40625, "completions/mean_terminated_length": 4.5416669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6765875145792961, "epoch": 0.00753, "frac_reward_zero_std": 0.25, "grad_norm": 0.0352669321000576, "kl": 0.25866834446787834, "learning_rate": 7.999809665836181e-06, "loss": -0.0762, "num_tokens": 19915267.0, "reward": 1.0106585025787354, "reward_std": 1.1781666278839111, "rewards/rollout_reward_func/mean": 1.0106585025787354, "rewards/rollout_reward_func/std": 1.1781667470932007, "sampling/importance_sampling_ratio/max": 1.6172109842300415, "sampling/importance_sampling_ratio/mean": 0.7764466404914856, "sampling/importance_sampling_ratio/min": 5.2126111427241995e-08, "sampling/sampling_logp_difference/max": 1.7771540880203247, "sampling/sampling_logp_difference/mean": 0.3594900965690613, "step": 753, "step_time": 30.496500797962653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6777285262942314, "epoch": 0.00754, "grad_norm": 0.03598443418741226, "kl": 0.25588724948465824, "learning_rate": 7.999809134553408e-06, "loss": -0.0762, "step": 754, "step_time": 14.19903211001656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.1875, "completions/mean_terminated_length": 4.838709354400635, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9465903183445334, "epoch": 0.00755, "frac_reward_zero_std": 0.25, "grad_norm": 0.053162772208452225, "kl": 1.0918358154594898, "learning_rate": 7.999808602530205e-06, "loss": -0.0629, "num_tokens": 19954718.0, "reward": 1.6617202758789062, "reward_std": 0.8505350351333618, "rewards/rollout_reward_func/mean": 1.6617202758789062, "rewards/rollout_reward_func/std": 0.8505350947380066, "sampling/importance_sampling_ratio/max": 1.115900993347168, "sampling/importance_sampling_ratio/mean": 0.8734108209609985, "sampling/importance_sampling_ratio/min": 6.558431323355762e-06, "sampling/sampling_logp_difference/max": 1.7962366342544556, "sampling/sampling_logp_difference/mean": 0.2505466938018799, "step": 755, "step_time": 18.885320108980522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9500075271353126, "epoch": 0.00756, "grad_norm": 0.05053522810339928, "kl": 1.0719207618385553, "learning_rate": 7.99980806976657e-06, "loss": -0.0629, "step": 756, "step_time": 10.501252969988855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.625, "completions/mean_terminated_length": 7.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.798364445567131, "epoch": 0.00757, "frac_reward_zero_std": 0.0, "grad_norm": 0.052862491458654404, "kl": 0.27361831068992615, "learning_rate": 7.9998075362625e-06, "loss": -0.0879, "num_tokens": 20012758.0, "reward": 0.12128615379333496, "reward_std": 1.378861904144287, "rewards/rollout_reward_func/mean": 0.12128615379333496, "rewards/rollout_reward_func/std": 1.3788617849349976, "sampling/importance_sampling_ratio/max": 1.2963680028915405, "sampling/importance_sampling_ratio/mean": 0.40813013911247253, "sampling/importance_sampling_ratio/min": 4.1980740661529126e-07, "sampling/sampling_logp_difference/max": 2.4210662841796875, "sampling/sampling_logp_difference/mean": 0.46332335472106934, "step": 757, "step_time": 33.93171712002368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8035174161195755, "epoch": 0.00758, "grad_norm": 0.053434357047080994, "kl": 0.26836300548166037, "learning_rate": 7.999807002018e-06, "loss": -0.0879, "step": 758, "step_time": 15.742718152003363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.96875, "completions/mean_terminated_length": 4.440000057220459, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4001493752002716, "epoch": 0.00759, "frac_reward_zero_std": 0.0, "grad_norm": 0.14058953523635864, "kl": 0.42750298231840134, "learning_rate": 7.999806467033068e-06, "loss": -0.0782, "num_tokens": 20067191.0, "reward": 1.061539888381958, "reward_std": 1.225516676902771, "rewards/rollout_reward_func/mean": 1.061539888381958, "rewards/rollout_reward_func/std": 1.225516676902771, "sampling/importance_sampling_ratio/max": 1.337383508682251, "sampling/importance_sampling_ratio/mean": 0.7886685132980347, "sampling/importance_sampling_ratio/min": 1.5083118341863155e-05, "sampling/sampling_logp_difference/max": 1.8800866603851318, "sampling/sampling_logp_difference/mean": 0.2743118107318878, "step": 759, "step_time": 30.38399331198889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4056002385914326, "epoch": 0.0076, "grad_norm": 0.1544615477323532, "kl": 0.4273244719952345, "learning_rate": 7.999805931307704e-06, "loss": -0.0784, "step": 760, "step_time": 14.176772909006104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.09375, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.226790349930525, "epoch": 0.00761, "frac_reward_zero_std": 0.25, "grad_norm": 0.030049389228224754, "kl": 0.2734915819019079, "learning_rate": 7.99980539484191e-06, "loss": -0.0802, "num_tokens": 20118239.0, "reward": 0.593418300151825, "reward_std": 1.4107009172439575, "rewards/rollout_reward_func/mean": 0.593418300151825, "rewards/rollout_reward_func/std": 1.4107009172439575, "sampling/importance_sampling_ratio/max": 1.286253571510315, "sampling/importance_sampling_ratio/mean": 0.5985358357429504, "sampling/importance_sampling_ratio/min": 1.3620288541460468e-07, "sampling/sampling_logp_difference/max": 3.0026917457580566, "sampling/sampling_logp_difference/mean": 0.4011768400669098, "step": 761, "step_time": 36.44166367898288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2261842638254166, "epoch": 0.00762, "grad_norm": 0.030259665101766586, "kl": 0.2843352574855089, "learning_rate": 7.99980485763568e-06, "loss": -0.0801, "step": 762, "step_time": 15.654699698017794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.28125, "completions/mean_terminated_length": 4.481481552124023, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0727082900702953, "epoch": 0.00763, "frac_reward_zero_std": 0.5, "grad_norm": 0.0552058145403862, "kl": 0.3717994838953018, "learning_rate": 7.999804319689023e-06, "loss": -0.0313, "num_tokens": 20170084.0, "reward": 1.0646259784698486, "reward_std": 1.2283382415771484, "rewards/rollout_reward_func/mean": 1.0646259784698486, "rewards/rollout_reward_func/std": 1.2283382415771484, "sampling/importance_sampling_ratio/max": 1.5959532260894775, "sampling/importance_sampling_ratio/mean": 0.8361546993255615, "sampling/importance_sampling_ratio/min": 2.0222641978762113e-06, "sampling/sampling_logp_difference/max": 1.8922964334487915, "sampling/sampling_logp_difference/mean": 0.22778558731079102, "step": 763, "step_time": 33.502525085015805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0765387378633022, "epoch": 0.00764, "grad_norm": 0.06682363152503967, "kl": 0.3617520071566105, "learning_rate": 7.999803781001934e-06, "loss": -0.0314, "step": 764, "step_time": 14.734678046996123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.6875, "completions/mean_terminated_length": 6.142857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.753394678235054, "epoch": 0.00765, "frac_reward_zero_std": 0.0, "grad_norm": 0.07892967015504837, "kl": 0.15634644497185946, "learning_rate": 7.999803241574412e-06, "loss": -0.1006, "num_tokens": 20232370.0, "reward": 0.041187092661857605, "reward_std": 1.2963039875030518, "rewards/rollout_reward_func/mean": 0.041187092661857605, "rewards/rollout_reward_func/std": 1.2963038682937622, "sampling/importance_sampling_ratio/max": 1.1856458187103271, "sampling/importance_sampling_ratio/mean": 0.3152437210083008, "sampling/importance_sampling_ratio/min": 1.970654892602397e-07, "sampling/sampling_logp_difference/max": 2.110276222229004, "sampling/sampling_logp_difference/mean": 0.42518651485443115, "step": 765, "step_time": 32.09442860999843 }, { "clip_ratio/high_max": 0.004310344811528921, "clip_ratio/high_mean": 0.0021551724057644606, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021551724057644606, "entropy": 2.757061168551445, "epoch": 0.00766, "grad_norm": 0.08451461791992188, "kl": 0.15654065134003758, "learning_rate": 7.999802701406462e-06, "loss": -0.1006, "step": 766, "step_time": 13.265874866949162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.34375, "completions/mean_terminated_length": 4.458333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.925946183502674, "epoch": 0.00767, "frac_reward_zero_std": 0.25, "grad_norm": 0.09406567364931107, "kl": 0.19305978063493967, "learning_rate": 7.99980216049808e-06, "loss": -0.079, "num_tokens": 20284715.0, "reward": 0.8702121376991272, "reward_std": 1.3285281658172607, "rewards/rollout_reward_func/mean": 0.8702121376991272, "rewards/rollout_reward_func/std": 1.3285281658172607, "sampling/importance_sampling_ratio/max": 1.2827680110931396, "sampling/importance_sampling_ratio/mean": 0.7467526197433472, "sampling/importance_sampling_ratio/min": 2.2919437459023584e-09, "sampling/sampling_logp_difference/max": 3.1951520442962646, "sampling/sampling_logp_difference/mean": 0.35427895188331604, "step": 767, "step_time": 27.805186992001836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9317767918109894, "epoch": 0.00768, "grad_norm": 0.09666401147842407, "kl": 0.1943506971001625, "learning_rate": 7.999801618849266e-06, "loss": -0.0791, "step": 768, "step_time": 13.474453414004529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.40625, "completions/mean_terminated_length": 4.310344696044922, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8836905276402831, "epoch": 0.00769, "frac_reward_zero_std": 0.25, "grad_norm": 0.0769524946808815, "kl": 0.3617009185254574, "learning_rate": 7.999801076460023e-06, "loss": -0.0483, "num_tokens": 20320836.0, "reward": 1.1805856227874756, "reward_std": 1.3062174320220947, "rewards/rollout_reward_func/mean": 1.1805856227874756, "rewards/rollout_reward_func/std": 1.3062175512313843, "sampling/importance_sampling_ratio/max": 1.1676454544067383, "sampling/importance_sampling_ratio/mean": 0.8853520154953003, "sampling/importance_sampling_ratio/min": 8.11200061434647e-06, "sampling/sampling_logp_difference/max": 1.982320785522461, "sampling/sampling_logp_difference/mean": 0.20456230640411377, "step": 769, "step_time": 20.548161067054025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8752041906118393, "epoch": 0.0077, "grad_norm": 0.07373743504285812, "kl": 0.37512430362403393, "learning_rate": 7.999800533330349e-06, "loss": -0.0483, "step": 770, "step_time": 11.295850069975131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.375, "completions/mean_terminated_length": 5.833333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.379193678498268, "epoch": 0.00771, "frac_reward_zero_std": 0.0, "grad_norm": 0.17457027733325958, "kl": 1.029720464721322, "learning_rate": 7.999799989460245e-06, "loss": -0.03, "num_tokens": 20382345.0, "reward": 0.20196452736854553, "reward_std": 1.1240208148956299, "rewards/rollout_reward_func/mean": 0.20196452736854553, "rewards/rollout_reward_func/std": 1.1240208148956299, "sampling/importance_sampling_ratio/max": 1.34200918674469, "sampling/importance_sampling_ratio/mean": 0.4726600646972656, "sampling/importance_sampling_ratio/min": 1.0195426369818961e-07, "sampling/sampling_logp_difference/max": 2.1703062057495117, "sampling/sampling_logp_difference/mean": 0.420458048582077, "step": 771, "step_time": 33.46473675900779 }, { "clip_ratio/high_max": 0.0390625, "clip_ratio/high_mean": 0.01953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01953125, "entropy": 2.3669066056609154, "epoch": 0.00772, "grad_norm": 0.13040238618850708, "kl": 0.9147299216128886, "learning_rate": 7.999799444849711e-06, "loss": -0.0313, "step": 772, "step_time": 15.565940072992817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 9.09375, "completions/mean_terminated_length": 4.3684210777282715, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.224881574511528, "epoch": 0.00773, "frac_reward_zero_std": 0.0, "grad_norm": 0.07051952928304672, "kl": 0.1744283251464367, "learning_rate": 7.999798899498748e-06, "loss": -0.0312, "num_tokens": 20441053.0, "reward": -0.5739054679870605, "reward_std": 0.6456372141838074, "rewards/rollout_reward_func/mean": -0.5739054679870605, "rewards/rollout_reward_func/std": 0.6456372141838074, "sampling/importance_sampling_ratio/max": 1.2802693843841553, "sampling/importance_sampling_ratio/mean": 0.518051028251648, "sampling/importance_sampling_ratio/min": 1.1140871720272116e-05, "sampling/sampling_logp_difference/max": 1.6990787982940674, "sampling/sampling_logp_difference/mean": 0.3691486716270447, "step": 773, "step_time": 34.49784322800406 }, { "clip_ratio/high_max": 0.016666667070239782, "clip_ratio/high_mean": 0.008333333535119891, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008333333535119891, "entropy": 2.201055236160755, "epoch": 0.00774, "grad_norm": 0.06198858469724655, "kl": 0.16680359467864037, "learning_rate": 7.999798353407354e-06, "loss": -0.0315, "step": 774, "step_time": 15.307090212983894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.0625, "completions/mean_terminated_length": 5.44444465637207, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.800249621272087, "epoch": 0.00775, "frac_reward_zero_std": 0.0, "grad_norm": 0.18112485110759735, "kl": 0.21673500118777156, "learning_rate": 7.99979780657553e-06, "loss": -0.0785, "num_tokens": 20495578.0, "reward": -0.39720025658607483, "reward_std": 1.0289080142974854, "rewards/rollout_reward_func/mean": -0.39720025658607483, "rewards/rollout_reward_func/std": 1.0289080142974854, "sampling/importance_sampling_ratio/max": 1.5042715072631836, "sampling/importance_sampling_ratio/mean": 0.47267770767211914, "sampling/importance_sampling_ratio/min": 1.5596614844071155e-07, "sampling/sampling_logp_difference/max": 2.0484957695007324, "sampling/sampling_logp_difference/mean": 0.4833751320838928, "step": 775, "step_time": 27.62407816799532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.79612635076046, "epoch": 0.00776, "grad_norm": 0.17293627560138702, "kl": 0.21409169305115938, "learning_rate": 7.999797259003276e-06, "loss": -0.079, "step": 776, "step_time": 13.058139947999734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.71875, "completions/mean_terminated_length": 4.958333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.220235526561737, "epoch": 0.00777, "frac_reward_zero_std": 0.0, "grad_norm": 0.0392063669860363, "kl": 0.48371568135917187, "learning_rate": 7.999796710690594e-06, "loss": -0.083, "num_tokens": 20550564.0, "reward": 0.6019676923751831, "reward_std": 1.3789271116256714, "rewards/rollout_reward_func/mean": 0.6019676923751831, "rewards/rollout_reward_func/std": 1.3789271116256714, "sampling/importance_sampling_ratio/max": 1.2713077068328857, "sampling/importance_sampling_ratio/mean": 0.6578399538993835, "sampling/importance_sampling_ratio/min": 2.339966613362776e-06, "sampling/sampling_logp_difference/max": 1.9894646406173706, "sampling/sampling_logp_difference/mean": 0.3901767134666443, "step": 777, "step_time": 28.966315866986406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2192612811923027, "epoch": 0.00778, "grad_norm": 0.03816106170415878, "kl": 0.4758562631905079, "learning_rate": 7.999796161637481e-06, "loss": -0.0831, "step": 778, "step_time": 13.53265756397741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.59375, "completions/mean_terminated_length": 5.227272987365723, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.748962864279747, "epoch": 0.00779, "frac_reward_zero_std": 0.0, "grad_norm": 0.16843527555465698, "kl": 0.2203731806948781, "learning_rate": 7.99979561184394e-06, "loss": -0.0633, "num_tokens": 20610974.0, "reward": -0.23239004611968994, "reward_std": 1.0389105081558228, "rewards/rollout_reward_func/mean": -0.23239004611968994, "rewards/rollout_reward_func/std": 1.0389105081558228, "sampling/importance_sampling_ratio/max": 1.2761423587799072, "sampling/importance_sampling_ratio/mean": 0.4692695736885071, "sampling/importance_sampling_ratio/min": 1.4853326035790815e-07, "sampling/sampling_logp_difference/max": 2.1681885719299316, "sampling/sampling_logp_difference/mean": 0.4568655490875244, "step": 779, "step_time": 33.26536235299136 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.7409749180078506, "epoch": 0.0078, "grad_norm": 0.07353700697422028, "kl": 0.2156283580698073, "learning_rate": 7.99979506130997e-06, "loss": -0.0641, "step": 780, "step_time": 16.426947896994534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.34375, "completions/mean_terminated_length": 5.34615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.816550076007843, "epoch": 0.00781, "frac_reward_zero_std": 0.0, "grad_norm": 0.058630477637052536, "kl": 0.4490524362772703, "learning_rate": 7.999794510035571e-06, "loss": -0.0542, "num_tokens": 20664874.0, "reward": 0.7155901193618774, "reward_std": 1.3007421493530273, "rewards/rollout_reward_func/mean": 0.7155901193618774, "rewards/rollout_reward_func/std": 1.3007421493530273, "sampling/importance_sampling_ratio/max": 1.487971544265747, "sampling/importance_sampling_ratio/mean": 0.7493809461593628, "sampling/importance_sampling_ratio/min": 6.76283065104144e-08, "sampling/sampling_logp_difference/max": 2.411142349243164, "sampling/sampling_logp_difference/mean": 0.35101965069770813, "step": 781, "step_time": 28.856759589034482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8150685131549835, "epoch": 0.00782, "grad_norm": 0.0608384869992733, "kl": 0.43302392959594727, "learning_rate": 7.999793958020743e-06, "loss": -0.0542, "step": 782, "step_time": 12.546748805994866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 4.159999847412109, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.634852496907115, "epoch": 0.00783, "frac_reward_zero_std": 0.5, "grad_norm": 0.014159632846713066, "kl": 0.21583707816898823, "learning_rate": 7.999793405265487e-06, "loss": -0.0452, "num_tokens": 20716296.0, "reward": 1.1360094547271729, "reward_std": 1.237601637840271, "rewards/rollout_reward_func/mean": 1.1360094547271729, "rewards/rollout_reward_func/std": 1.237601637840271, "sampling/importance_sampling_ratio/max": 1.1500853300094604, "sampling/importance_sampling_ratio/mean": 0.7968487739562988, "sampling/importance_sampling_ratio/min": 3.1416867329880915e-08, "sampling/sampling_logp_difference/max": 2.148597240447998, "sampling/sampling_logp_difference/mean": 0.3539029061794281, "step": 783, "step_time": 30.821096758008935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6327811889350414, "epoch": 0.00784, "grad_norm": 0.013053926639258862, "kl": 0.2159451935440302, "learning_rate": 7.999792851769802e-06, "loss": -0.0452, "step": 784, "step_time": 15.408724346008967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.71875, "completions/mean_terminated_length": 5.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.575386706739664, "epoch": 0.00785, "frac_reward_zero_std": 0.0, "grad_norm": 0.04870741814374924, "kl": 0.45270927157253027, "learning_rate": 7.99979229753369e-06, "loss": -0.0781, "num_tokens": 20771709.0, "reward": 0.4755091071128845, "reward_std": 1.2612783908843994, "rewards/rollout_reward_func/mean": 0.4755091071128845, "rewards/rollout_reward_func/std": 1.2612783908843994, "sampling/importance_sampling_ratio/max": 1.2408044338226318, "sampling/importance_sampling_ratio/mean": 0.6680436134338379, "sampling/importance_sampling_ratio/min": 0.00011240856110816821, "sampling/sampling_logp_difference/max": 1.7386430501937866, "sampling/sampling_logp_difference/mean": 0.2932116985321045, "step": 785, "step_time": 38.109517973003676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.573380945250392, "epoch": 0.00786, "grad_norm": 0.047127362340688705, "kl": 0.4399175336584449, "learning_rate": 7.999791742557149e-06, "loss": -0.0782, "step": 786, "step_time": 16.78615477202402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.53125, "completions/mean_terminated_length": 6.0416669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.303202025592327, "epoch": 0.00787, "frac_reward_zero_std": 0.0, "grad_norm": 0.2138357162475586, "kl": 0.3011426078155637, "learning_rate": 7.999791186840177e-06, "loss": -0.0735, "num_tokens": 20829801.0, "reward": 0.08466595411300659, "reward_std": 1.0785294771194458, "rewards/rollout_reward_func/mean": 0.08466595411300659, "rewards/rollout_reward_func/std": 1.0785294771194458, "sampling/importance_sampling_ratio/max": 1.599027395248413, "sampling/importance_sampling_ratio/mean": 0.5911856889724731, "sampling/importance_sampling_ratio/min": 5.057314638179378e-07, "sampling/sampling_logp_difference/max": 2.447781562805176, "sampling/sampling_logp_difference/mean": 0.4256308674812317, "step": 787, "step_time": 36.889610350976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2950199134647846, "epoch": 0.00788, "grad_norm": 0.16359174251556396, "kl": 0.29035926051437855, "learning_rate": 7.99979063038278e-06, "loss": -0.0745, "step": 788, "step_time": 16.23193911400449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 5.679999828338623, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.842388316988945, "epoch": 0.00789, "frac_reward_zero_std": 0.0, "grad_norm": 0.056684572249650955, "kl": 0.2822648659348488, "learning_rate": 7.999790073184955e-06, "loss": -0.0753, "num_tokens": 20887595.0, "reward": 0.44059744477272034, "reward_std": 1.3387961387634277, "rewards/rollout_reward_func/mean": 0.44059744477272034, "rewards/rollout_reward_func/std": 1.3387961387634277, "sampling/importance_sampling_ratio/max": 1.7878074645996094, "sampling/importance_sampling_ratio/mean": 0.7066695690155029, "sampling/importance_sampling_ratio/min": 2.7559593718251563e-07, "sampling/sampling_logp_difference/max": 2.646181583404541, "sampling/sampling_logp_difference/mean": 0.4165910482406616, "step": 789, "step_time": 28.074638553982368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8374291062355042, "epoch": 0.0079, "grad_norm": 0.05781964585185051, "kl": 0.28072041366249323, "learning_rate": 7.999789515246702e-06, "loss": -0.0752, "step": 790, "step_time": 13.69672654097667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.59375, "completions/mean_terminated_length": 5.210526466369629, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.6873771101236343, "epoch": 0.00791, "frac_reward_zero_std": 0.0, "grad_norm": 0.10634049773216248, "kl": 0.16921798046678305, "learning_rate": 7.999788956568022e-06, "loss": -0.0586, "num_tokens": 20944287.0, "reward": 0.04713734984397888, "reward_std": 1.131844401359558, "rewards/rollout_reward_func/mean": 0.04713734984397888, "rewards/rollout_reward_func/std": 1.131844401359558, "sampling/importance_sampling_ratio/max": 1.2785204648971558, "sampling/importance_sampling_ratio/mean": 0.55671226978302, "sampling/importance_sampling_ratio/min": 3.69932031674125e-08, "sampling/sampling_logp_difference/max": 2.123196601867676, "sampling/sampling_logp_difference/mean": 0.43517324328422546, "step": 791, "step_time": 33.57376625301549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.6861216723918915, "epoch": 0.00792, "grad_norm": 0.10737816989421844, "kl": 0.1680076690390706, "learning_rate": 7.999788397148915e-06, "loss": -0.059, "step": 792, "step_time": 13.669199023963301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.84375, "completions/mean_terminated_length": 4.730769634246826, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4993671840056777, "epoch": 0.00793, "frac_reward_zero_std": 0.25, "grad_norm": 0.023817837238311768, "kl": 0.29020537994802, "learning_rate": 7.99978783698938e-06, "loss": -0.0403, "num_tokens": 20992385.0, "reward": 0.9368084073066711, "reward_std": 1.1723334789276123, "rewards/rollout_reward_func/mean": 0.9368084073066711, "rewards/rollout_reward_func/std": 1.1723334789276123, "sampling/importance_sampling_ratio/max": 1.3797085285186768, "sampling/importance_sampling_ratio/mean": 0.8059276342391968, "sampling/importance_sampling_ratio/min": 3.5459395348880207e-06, "sampling/sampling_logp_difference/max": 1.8241485357284546, "sampling/sampling_logp_difference/mean": 0.3143259286880493, "step": 793, "step_time": 26.190667966002366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4974998477846384, "epoch": 0.00794, "grad_norm": 0.02509940229356289, "kl": 0.29004910588264465, "learning_rate": 7.999787276089417e-06, "loss": -0.0402, "step": 794, "step_time": 12.384977911991882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.09375, "completions/mean_terminated_length": 4.950000286102295, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.22191190905869, "epoch": 0.00795, "frac_reward_zero_std": 0.0, "grad_norm": 0.03648506477475166, "kl": 0.2609394518658519, "learning_rate": 7.999786714449027e-06, "loss": -0.0923, "num_tokens": 21050446.0, "reward": 0.11083370447158813, "reward_std": 1.2830543518066406, "rewards/rollout_reward_func/mean": 0.11083370447158813, "rewards/rollout_reward_func/std": 1.283054232597351, "sampling/importance_sampling_ratio/max": 1.3186023235321045, "sampling/importance_sampling_ratio/mean": 0.5747926235198975, "sampling/importance_sampling_ratio/min": 7.84755016525196e-08, "sampling/sampling_logp_difference/max": 1.9850215911865234, "sampling/sampling_logp_difference/mean": 0.4057513475418091, "step": 795, "step_time": 28.853796771974885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2197031653486192, "epoch": 0.00796, "grad_norm": 0.03303394466638565, "kl": 0.24315810203552246, "learning_rate": 7.999786152068212e-06, "loss": -0.0924, "step": 796, "step_time": 12.700276359988493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5530486442148685, "epoch": 0.00797, "frac_reward_zero_std": 0.75, "grad_norm": 0.14485643804073334, "kl": 0.24463555868715048, "learning_rate": 7.999785588946968e-06, "loss": -0.0111, "num_tokens": 21086154.0, "reward": 1.2674862146377563, "reward_std": 1.2992089986801147, "rewards/rollout_reward_func/mean": 1.2674862146377563, "rewards/rollout_reward_func/std": 1.2992091178894043, "sampling/importance_sampling_ratio/max": 1.416534662246704, "sampling/importance_sampling_ratio/mean": 1.0243535041809082, "sampling/importance_sampling_ratio/min": 1.653450271987822e-05, "sampling/sampling_logp_difference/max": 2.0183677673339844, "sampling/sampling_logp_difference/mean": 0.13512484729290009, "step": 797, "step_time": 21.661423793018912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5488722994923592, "epoch": 0.00798, "grad_norm": 0.13833604753017426, "kl": 0.2458822876214981, "learning_rate": 7.999785025085299e-06, "loss": -0.0115, "step": 798, "step_time": 12.08071093900071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.28125, "completions/mean_terminated_length": 4.238095283508301, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.141081228852272, "epoch": 0.00799, "frac_reward_zero_std": 0.25, "grad_norm": 0.10135247558355331, "kl": 0.2508438602089882, "learning_rate": 7.999784460483204e-06, "loss": -0.0762, "num_tokens": 21138401.0, "reward": 0.14454254508018494, "reward_std": 1.405651330947876, "rewards/rollout_reward_func/mean": 0.14454254508018494, "rewards/rollout_reward_func/std": 1.405651330947876, "sampling/importance_sampling_ratio/max": 1.3870667219161987, "sampling/importance_sampling_ratio/mean": 0.6682336330413818, "sampling/importance_sampling_ratio/min": 1.4803181613842753e-07, "sampling/sampling_logp_difference/max": 1.957672357559204, "sampling/sampling_logp_difference/mean": 0.38781046867370605, "step": 799, "step_time": 29.347629192008753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 2.133545368909836, "epoch": 0.008, "grad_norm": 0.08938322216272354, "kl": 0.26387784257531166, "learning_rate": 7.999783895140683e-06, "loss": -0.0764, "step": 800, "step_time": 14.48512492000009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.125, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.6746728867292404, "epoch": 0.00801, "frac_reward_zero_std": 0.0, "grad_norm": 0.08602066338062286, "kl": 0.15697600319981575, "learning_rate": 7.999783329057734e-06, "loss": -0.1067, "num_tokens": 21191473.0, "reward": 0.547858715057373, "reward_std": 1.3770290613174438, "rewards/rollout_reward_func/mean": 0.547858715057373, "rewards/rollout_reward_func/std": 1.3770290613174438, "sampling/importance_sampling_ratio/max": 1.499642252922058, "sampling/importance_sampling_ratio/mean": 0.5942437648773193, "sampling/importance_sampling_ratio/min": 2.7727233842256283e-09, "sampling/sampling_logp_difference/max": 2.4861018657684326, "sampling/sampling_logp_difference/mean": 0.4583587646484375, "step": 801, "step_time": 32.12646480597323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.6719762682914734, "epoch": 0.00802, "grad_norm": 0.07699159532785416, "kl": 0.1614753631874919, "learning_rate": 7.99978276223436e-06, "loss": -0.1069, "step": 802, "step_time": 13.573425587004749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.125, "completions/mean_terminated_length": 5.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3141599521040916, "epoch": 0.00803, "frac_reward_zero_std": 0.25, "grad_norm": 0.10097987204790115, "kl": 0.51224678568542, "learning_rate": 7.99978219467056e-06, "loss": -0.03, "num_tokens": 21243771.0, "reward": 0.794008731842041, "reward_std": 1.444628119468689, "rewards/rollout_reward_func/mean": 0.794008731842041, "rewards/rollout_reward_func/std": 1.4446280002593994, "sampling/importance_sampling_ratio/max": 1.5039373636245728, "sampling/importance_sampling_ratio/mean": 0.6509023308753967, "sampling/importance_sampling_ratio/min": 5.29074739574753e-09, "sampling/sampling_logp_difference/max": 2.387165069580078, "sampling/sampling_logp_difference/mean": 0.47210821509361267, "step": 803, "step_time": 29.36843090399634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003289473708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003289473708719015, "entropy": 2.3113590329885483, "epoch": 0.00804, "grad_norm": 0.10189768671989441, "kl": 0.5203666696324944, "learning_rate": 7.999781626366335e-06, "loss": -0.0299, "step": 804, "step_time": 14.594597250994411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 4.296296119689941, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.350913293659687, "epoch": 0.00805, "frac_reward_zero_std": 0.5, "grad_norm": 0.02898293174803257, "kl": 0.30919216200709343, "learning_rate": 7.999781057321683e-06, "loss": -0.0484, "num_tokens": 21290840.0, "reward": 1.0304594039916992, "reward_std": 1.3308967351913452, "rewards/rollout_reward_func/mean": 1.0304594039916992, "rewards/rollout_reward_func/std": 1.3308967351913452, "sampling/importance_sampling_ratio/max": 1.3512154817581177, "sampling/importance_sampling_ratio/mean": 0.8096973896026611, "sampling/importance_sampling_ratio/min": 1.2997507070622305e-08, "sampling/sampling_logp_difference/max": 1.8870054483413696, "sampling/sampling_logp_difference/mean": 0.3028547465801239, "step": 805, "step_time": 27.29827671799285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3475945508107543, "epoch": 0.00806, "grad_norm": 0.029095754027366638, "kl": 0.3099211733788252, "learning_rate": 7.999780487536607e-06, "loss": -0.0484, "step": 806, "step_time": 13.468836083979113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.09375, "completions/mean_terminated_length": 5.476190567016602, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9839907325804234, "epoch": 0.00807, "frac_reward_zero_std": 0.0, "grad_norm": 0.13844512403011322, "kl": 0.4720021989196539, "learning_rate": 7.999779917011105e-06, "loss": -0.0896, "num_tokens": 21354633.0, "reward": 0.42317745089530945, "reward_std": 1.2850596904754639, "rewards/rollout_reward_func/mean": 0.42317745089530945, "rewards/rollout_reward_func/std": 1.2850596904754639, "sampling/importance_sampling_ratio/max": 1.4344249963760376, "sampling/importance_sampling_ratio/mean": 0.6026705503463745, "sampling/importance_sampling_ratio/min": 3.2920812031989044e-07, "sampling/sampling_logp_difference/max": 2.2552008628845215, "sampling/sampling_logp_difference/mean": 0.38766270875930786, "step": 807, "step_time": 35.82450533901283 }, { "clip_ratio/high_max": 0.004310344811528921, "clip_ratio/high_mean": 0.0021551724057644606, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00996767240576446, "entropy": 1.9861151203513145, "epoch": 0.00808, "grad_norm": 0.1237243264913559, "kl": 0.4368342459201813, "learning_rate": 7.999779345745178e-06, "loss": -0.0899, "step": 808, "step_time": 14.739710032037692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 4.6666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5616520857438445, "epoch": 0.00809, "frac_reward_zero_std": 0.25, "grad_norm": 0.06757571548223495, "kl": 0.17935851961374283, "learning_rate": 7.999778773738826e-06, "loss": -0.0715, "num_tokens": 21405955.0, "reward": 0.6659345030784607, "reward_std": 1.3490869998931885, "rewards/rollout_reward_func/mean": 0.6659345030784607, "rewards/rollout_reward_func/std": 1.3490869998931885, "sampling/importance_sampling_ratio/max": 1.2520523071289062, "sampling/importance_sampling_ratio/mean": 0.7111777067184448, "sampling/importance_sampling_ratio/min": 0.00017383351223543286, "sampling/sampling_logp_difference/max": 1.86268949508667, "sampling/sampling_logp_difference/mean": 0.2824941873550415, "step": 809, "step_time": 29.817644714028575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5682267174124718, "epoch": 0.0081, "grad_norm": 0.06497394293546677, "kl": 0.17707400768995285, "learning_rate": 7.999778200992049e-06, "loss": -0.0717, "step": 810, "step_time": 14.148419653982273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.03125, "completions/mean_terminated_length": 6.0625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.9045515954494476, "epoch": 0.00811, "frac_reward_zero_std": 0.0, "grad_norm": 0.06274095177650452, "kl": 0.09295652038417757, "learning_rate": 7.999777627504848e-06, "loss": -0.0805, "num_tokens": 21475879.0, "reward": -0.22049155831336975, "reward_std": 1.1180353164672852, "rewards/rollout_reward_func/mean": -0.22049155831336975, "rewards/rollout_reward_func/std": 1.1180353164672852, "sampling/importance_sampling_ratio/max": 1.423152208328247, "sampling/importance_sampling_ratio/mean": 0.3704395294189453, "sampling/importance_sampling_ratio/min": 1.2400439572957112e-06, "sampling/sampling_logp_difference/max": 2.5926449298858643, "sampling/sampling_logp_difference/mean": 0.4218041002750397, "step": 811, "step_time": 39.96259092701075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00552884628996253, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00552884628996253, "entropy": 2.9060023427009583, "epoch": 0.00812, "grad_norm": 0.06219665706157684, "kl": 0.09326797630637884, "learning_rate": 7.999777053277222e-06, "loss": -0.0809, "step": 812, "step_time": 16.29104209599609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.84375, "completions/mean_terminated_length": 4.730769634246826, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3407500768080354, "epoch": 0.00813, "frac_reward_zero_std": 0.25, "grad_norm": 0.030005265027284622, "kl": 0.42542761843651533, "learning_rate": 7.99977647830917e-06, "loss": -0.0564, "num_tokens": 21525258.0, "reward": 0.0775659829378128, "reward_std": 1.294883131980896, "rewards/rollout_reward_func/mean": 0.0775659829378128, "rewards/rollout_reward_func/std": 1.2948830127716064, "sampling/importance_sampling_ratio/max": 1.3268256187438965, "sampling/importance_sampling_ratio/mean": 0.7485955953598022, "sampling/importance_sampling_ratio/min": 1.8733438992057927e-05, "sampling/sampling_logp_difference/max": 2.5824737548828125, "sampling/sampling_logp_difference/mean": 0.25835326313972473, "step": 813, "step_time": 31.633061496017035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3416238687932491, "epoch": 0.00814, "grad_norm": 0.030449548736214638, "kl": 0.42579542845487595, "learning_rate": 7.999775902600696e-06, "loss": -0.0564, "step": 814, "step_time": 14.360393774986733 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.90625, "completions/mean_terminated_length": 6.130434989929199, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.298532173037529, "epoch": 0.00815, "frac_reward_zero_std": 0.25, "grad_norm": 0.07293827831745148, "kl": 0.1360325813293457, "learning_rate": 7.999775326151796e-06, "loss": -0.031, "num_tokens": 21581088.0, "reward": 0.1604556441307068, "reward_std": 1.3045355081558228, "rewards/rollout_reward_func/mean": 0.1604556441307068, "rewards/rollout_reward_func/std": 1.3045353889465332, "sampling/importance_sampling_ratio/max": 1.2694131135940552, "sampling/importance_sampling_ratio/mean": 0.5822460055351257, "sampling/importance_sampling_ratio/min": 1.782176514097955e-05, "sampling/sampling_logp_difference/max": 1.770348072052002, "sampling/sampling_logp_difference/mean": 0.3591846823692322, "step": 815, "step_time": 33.46096845802094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.297291897237301, "epoch": 0.00816, "grad_norm": 0.05622413009405136, "kl": 0.1362305972725153, "learning_rate": 7.999774748962473e-06, "loss": -0.0311, "step": 816, "step_time": 15.062981919996673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 4.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.325739562511444, "epoch": 0.00817, "frac_reward_zero_std": 0.0, "grad_norm": 0.04967117682099342, "kl": 0.2443841677159071, "learning_rate": 7.999774171032726e-06, "loss": -0.0806, "num_tokens": 21636511.0, "reward": 0.21169348061084747, "reward_std": 1.421091079711914, "rewards/rollout_reward_func/mean": 0.21169348061084747, "rewards/rollout_reward_func/std": 1.421091079711914, "sampling/importance_sampling_ratio/max": 1.2356895208358765, "sampling/importance_sampling_ratio/mean": 0.5646690130233765, "sampling/importance_sampling_ratio/min": 7.127275239326991e-06, "sampling/sampling_logp_difference/max": 2.114652633666992, "sampling/sampling_logp_difference/mean": 0.3604778051376343, "step": 817, "step_time": 30.81314421998104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3270833045244217, "epoch": 0.00818, "grad_norm": 0.05221505090594292, "kl": 0.2509578801691532, "learning_rate": 7.999773592362555e-06, "loss": -0.0806, "step": 818, "step_time": 13.811004227027297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.3125, "completions/mean_terminated_length": 4.818181991577148, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1900248415768147, "epoch": 0.00819, "frac_reward_zero_std": 0.25, "grad_norm": 0.2363852858543396, "kl": 0.2627465391997248, "learning_rate": 7.99977301295196e-06, "loss": -0.0899, "num_tokens": 21686534.0, "reward": 0.5183266997337341, "reward_std": 1.4401934146881104, "rewards/rollout_reward_func/mean": 0.5183266997337341, "rewards/rollout_reward_func/std": 1.4401934146881104, "sampling/importance_sampling_ratio/max": 1.2629417181015015, "sampling/importance_sampling_ratio/mean": 0.6314672231674194, "sampling/importance_sampling_ratio/min": 1.7801059584598988e-05, "sampling/sampling_logp_difference/max": 1.9638617038726807, "sampling/sampling_logp_difference/mean": 0.37227320671081543, "step": 819, "step_time": 30.70684182700643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1845318973064423, "epoch": 0.0082, "grad_norm": 0.2178916037082672, "kl": 0.2689951441716403, "learning_rate": 7.999772432800943e-06, "loss": -0.0906, "step": 820, "step_time": 15.985771618012222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 9.84375, "completions/mean_terminated_length": 4.411764621734619, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2796359062194824, "epoch": 0.00821, "frac_reward_zero_std": 0.25, "grad_norm": 0.09118052572011948, "kl": 0.31023229705169797, "learning_rate": 7.999771851909502e-06, "loss": -0.059, "num_tokens": 21747002.0, "reward": 0.4718652069568634, "reward_std": 1.3207193613052368, "rewards/rollout_reward_func/mean": 0.4718652069568634, "rewards/rollout_reward_func/std": 1.3207193613052368, "sampling/importance_sampling_ratio/max": 1.2090930938720703, "sampling/importance_sampling_ratio/mean": 0.4748530089855194, "sampling/importance_sampling_ratio/min": 0.00022957034525461495, "sampling/sampling_logp_difference/max": 2.0306763648986816, "sampling/sampling_logp_difference/mean": 0.33261793851852417, "step": 821, "step_time": 36.67260050897312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.276114523410797, "epoch": 0.00822, "grad_norm": 0.09173552691936493, "kl": 0.3205284532159567, "learning_rate": 7.999771270277637e-06, "loss": -0.0587, "step": 822, "step_time": 15.196094867002103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 4.148148059844971, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3960515651851892, "epoch": 0.00823, "frac_reward_zero_std": 0.5, "grad_norm": 0.04934098199009895, "kl": 0.3632681518793106, "learning_rate": 7.99977068790535e-06, "loss": -0.0457, "num_tokens": 21793462.0, "reward": 0.07211989164352417, "reward_std": 1.0339837074279785, "rewards/rollout_reward_func/mean": 0.07211989164352417, "rewards/rollout_reward_func/std": 1.033983826637268, "sampling/importance_sampling_ratio/max": 1.3089828491210938, "sampling/importance_sampling_ratio/mean": 0.8400238156318665, "sampling/importance_sampling_ratio/min": 5.68306359127746e-06, "sampling/sampling_logp_difference/max": 1.9811015129089355, "sampling/sampling_logp_difference/mean": 0.29687780141830444, "step": 823, "step_time": 28.19691110297572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 1.3971812780946493, "epoch": 0.00824, "grad_norm": 0.044552914798259735, "kl": 0.3692216221243143, "learning_rate": 7.99977010479264e-06, "loss": -0.0459, "step": 824, "step_time": 15.32917669203016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.0625, "completions/mean_terminated_length": 4.454545497894287, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8904809756204486, "epoch": 0.00825, "frac_reward_zero_std": 0.5, "grad_norm": 0.09385872632265091, "kl": 0.4013822814449668, "learning_rate": 7.999769520939507e-06, "loss": -0.0061, "num_tokens": 21840445.0, "reward": 0.77082359790802, "reward_std": 1.3901535272598267, "rewards/rollout_reward_func/mean": 0.77082359790802, "rewards/rollout_reward_func/std": 1.3901535272598267, "sampling/importance_sampling_ratio/max": 1.235398769378662, "sampling/importance_sampling_ratio/mean": 0.6603244543075562, "sampling/importance_sampling_ratio/min": 1.0980786555592204e-06, "sampling/sampling_logp_difference/max": 1.956052541732788, "sampling/sampling_logp_difference/mean": 0.3394753932952881, "step": 825, "step_time": 27.38342644898512 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.8879795726388693, "epoch": 0.00826, "grad_norm": 0.08777827024459839, "kl": 0.33851241413503885, "learning_rate": 7.999768936345951e-06, "loss": -0.0064, "step": 826, "step_time": 12.114196408976568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.65625, "completions/mean_terminated_length": 4.3913044929504395, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.709820693358779, "epoch": 0.00827, "frac_reward_zero_std": 0.25, "grad_norm": 0.21889273822307587, "kl": 0.18169424775987864, "learning_rate": 7.999768351011975e-06, "loss": -0.0975, "num_tokens": 21894905.0, "reward": 0.9687581062316895, "reward_std": 1.162196397781372, "rewards/rollout_reward_func/mean": 0.9687581062316895, "rewards/rollout_reward_func/std": 1.162196397781372, "sampling/importance_sampling_ratio/max": 1.426922082901001, "sampling/importance_sampling_ratio/mean": 0.7864878177642822, "sampling/importance_sampling_ratio/min": 2.0485581444518175e-06, "sampling/sampling_logp_difference/max": 1.9639315605163574, "sampling/sampling_logp_difference/mean": 0.3661513924598694, "step": 827, "step_time": 29.22749675103114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7018459904938936, "epoch": 0.00828, "grad_norm": 0.17801563441753387, "kl": 0.18379682395607233, "learning_rate": 7.999767764937574e-06, "loss": -0.0985, "step": 828, "step_time": 13.537658189001377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 4.222222328186035, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1890793554484844, "epoch": 0.00829, "frac_reward_zero_std": 0.0, "grad_norm": 0.19637419283390045, "kl": 0.7264032550156116, "learning_rate": 7.999767178122752e-06, "loss": -0.0363, "num_tokens": 21941147.0, "reward": 0.699528694152832, "reward_std": 1.3898553848266602, "rewards/rollout_reward_func/mean": 0.699528694152832, "rewards/rollout_reward_func/std": 1.3898555040359497, "sampling/importance_sampling_ratio/max": 1.0950180292129517, "sampling/importance_sampling_ratio/mean": 0.7401107549667358, "sampling/importance_sampling_ratio/min": 1.2577005463754176e-06, "sampling/sampling_logp_difference/max": 2.3141143321990967, "sampling/sampling_logp_difference/mean": 0.3307873606681824, "step": 829, "step_time": 22.266739382015658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1754957251250744, "epoch": 0.0083, "grad_norm": 0.17012809216976166, "kl": 0.6501066721975803, "learning_rate": 7.999766590567506e-06, "loss": -0.0375, "step": 830, "step_time": 11.871779776993208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.71875, "completions/mean_terminated_length": 4.655172348022461, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4190430310554802, "epoch": 0.00831, "frac_reward_zero_std": 0.0, "grad_norm": 0.12415929138660431, "kl": 0.5097209066152573, "learning_rate": 7.99976600227184e-06, "loss": -0.0622, "num_tokens": 21995192.0, "reward": 0.6119921207427979, "reward_std": 1.251985788345337, "rewards/rollout_reward_func/mean": 0.6119921207427979, "rewards/rollout_reward_func/std": 1.251985788345337, "sampling/importance_sampling_ratio/max": 1.2020491361618042, "sampling/importance_sampling_ratio/mean": 0.7270315885543823, "sampling/importance_sampling_ratio/min": 1.8439125142322155e-06, "sampling/sampling_logp_difference/max": 2.036864757537842, "sampling/sampling_logp_difference/mean": 0.29675373435020447, "step": 831, "step_time": 24.301189131016145 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010016025975346565, "entropy": 1.4124177303165197, "epoch": 0.00832, "grad_norm": 0.12485604733228683, "kl": 0.4670321103185415, "learning_rate": 7.999765413235754e-06, "loss": -0.0623, "step": 832, "step_time": 12.322984521015314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.620689868927002, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.90813227603212, "epoch": 0.00833, "frac_reward_zero_std": 0.25, "grad_norm": 0.08053357154130936, "kl": 0.5265615247189999, "learning_rate": 7.999764823459244e-06, "loss": -0.0479, "num_tokens": 22045405.0, "reward": 0.4065437912940979, "reward_std": 1.284038782119751, "rewards/rollout_reward_func/mean": 0.4065437912940979, "rewards/rollout_reward_func/std": 1.2840386629104614, "sampling/importance_sampling_ratio/max": 1.3944017887115479, "sampling/importance_sampling_ratio/mean": 0.8728670477867126, "sampling/importance_sampling_ratio/min": 4.579675987770315e-06, "sampling/sampling_logp_difference/max": 1.977802038192749, "sampling/sampling_logp_difference/mean": 0.19948887825012207, "step": 833, "step_time": 24.977521200984484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9144371207803488, "epoch": 0.00834, "grad_norm": 0.08410836011171341, "kl": 0.5182352364063263, "learning_rate": 7.999764232942311e-06, "loss": -0.0478, "step": 834, "step_time": 13.316428518985049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.78125, "completions/mean_terminated_length": 4.653846263885498, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2110541872680187, "epoch": 0.00835, "frac_reward_zero_std": 0.25, "grad_norm": 0.08460535854101181, "kl": 0.20023595914244652, "learning_rate": 7.999763641684959e-06, "loss": -0.0364, "num_tokens": 22103270.0, "reward": 0.9826273918151855, "reward_std": 1.2232065200805664, "rewards/rollout_reward_func/mean": 0.9826273918151855, "rewards/rollout_reward_func/std": 1.2232064008712769, "sampling/importance_sampling_ratio/max": 1.9199813604354858, "sampling/importance_sampling_ratio/mean": 0.803528904914856, "sampling/importance_sampling_ratio/min": 2.0255367417121306e-05, "sampling/sampling_logp_difference/max": 1.8431936502456665, "sampling/sampling_logp_difference/mean": 0.2124062329530716, "step": 835, "step_time": 35.634152472004644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2202677596360445, "epoch": 0.00836, "grad_norm": 0.0894872397184372, "kl": 0.2010080199688673, "learning_rate": 7.999763049687186e-06, "loss": -0.0367, "step": 836, "step_time": 15.788907912006835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.125, "completions/mean_terminated_length": 4.639999866485596, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5587147139012814, "epoch": 0.00837, "frac_reward_zero_std": 0.25, "grad_norm": 0.0837898701429367, "kl": 0.45766994543373585, "learning_rate": 7.999762456948991e-06, "loss": -0.0696, "num_tokens": 22165038.0, "reward": 0.2545872926712036, "reward_std": 1.2606559991836548, "rewards/rollout_reward_func/mean": 0.2545872926712036, "rewards/rollout_reward_func/std": 1.2606559991836548, "sampling/importance_sampling_ratio/max": 1.4362212419509888, "sampling/importance_sampling_ratio/mean": 0.7101244330406189, "sampling/importance_sampling_ratio/min": 1.71515166584868e-05, "sampling/sampling_logp_difference/max": 1.7100552320480347, "sampling/sampling_logp_difference/mean": 0.3076016902923584, "step": 837, "step_time": 33.56052908698621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.574344439432025, "epoch": 0.00838, "grad_norm": 0.08629237860441208, "kl": 0.4354298673570156, "learning_rate": 7.999761863470376e-06, "loss": -0.07, "step": 838, "step_time": 15.569676584011177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.78125, "completions/mean_terminated_length": 5.0740742683410645, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6504995040595531, "epoch": 0.00839, "frac_reward_zero_std": 0.0, "grad_norm": 0.10219860076904297, "kl": 0.4077482959255576, "learning_rate": 7.999761269251341e-06, "loss": -0.0929, "num_tokens": 22225429.0, "reward": 0.854490339756012, "reward_std": 1.2538272142410278, "rewards/rollout_reward_func/mean": 0.854490339756012, "rewards/rollout_reward_func/std": 1.2538272142410278, "sampling/importance_sampling_ratio/max": 1.3810157775878906, "sampling/importance_sampling_ratio/mean": 0.7772927284240723, "sampling/importance_sampling_ratio/min": 0.00014076165098231286, "sampling/sampling_logp_difference/max": 1.238528847694397, "sampling/sampling_logp_difference/mean": 0.2861870527267456, "step": 839, "step_time": 29.80495560700365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.6656342670321465, "epoch": 0.0084, "grad_norm": 0.11807248741388321, "kl": 0.39725597482174635, "learning_rate": 7.999760674291884e-06, "loss": -0.0928, "step": 840, "step_time": 14.158275801004493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 10.375, "completions/mean_terminated_length": 4.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.580590233206749, "epoch": 0.00841, "frac_reward_zero_std": 0.0, "grad_norm": 0.1737624853849411, "kl": 0.19676938047632575, "learning_rate": 7.999760078592008e-06, "loss": -0.0601, "num_tokens": 22288805.0, "reward": -0.4255425035953522, "reward_std": 0.877173900604248, "rewards/rollout_reward_func/mean": -0.4255425035953522, "rewards/rollout_reward_func/std": 0.877173900604248, "sampling/importance_sampling_ratio/max": 1.1603314876556396, "sampling/importance_sampling_ratio/mean": 0.3133147954940796, "sampling/importance_sampling_ratio/min": 1.4228744475985877e-06, "sampling/sampling_logp_difference/max": 1.830185890197754, "sampling/sampling_logp_difference/mean": 0.42756858468055725, "step": 841, "step_time": 31.931158828985645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.015625, "clip_ratio/region_mean": 0.0234375, "entropy": 2.596981629729271, "epoch": 0.00842, "grad_norm": 0.07913526892662048, "kl": 0.25114920642226934, "learning_rate": 7.99975948215171e-06, "loss": -0.061, "step": 842, "step_time": 13.778046594976331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 8.125, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.262938193976879, "epoch": 0.00843, "frac_reward_zero_std": 0.25, "grad_norm": 0.05106630548834801, "kl": 0.18742151092737913, "learning_rate": 7.999758884970993e-06, "loss": -0.0821, "num_tokens": 22342475.0, "reward": 0.7160197496414185, "reward_std": 1.3931519985198975, "rewards/rollout_reward_func/mean": 0.7160197496414185, "rewards/rollout_reward_func/std": 1.3931519985198975, "sampling/importance_sampling_ratio/max": 1.202612280845642, "sampling/importance_sampling_ratio/mean": 0.6242858171463013, "sampling/importance_sampling_ratio/min": 2.1456884269355214e-07, "sampling/sampling_logp_difference/max": 1.9901810884475708, "sampling/sampling_logp_difference/mean": 0.4152276813983917, "step": 843, "step_time": 28.447067039000103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 2.2686360608786345, "epoch": 0.00844, "grad_norm": 0.04163109138607979, "kl": 0.19669717364013195, "learning_rate": 7.999758287049855e-06, "loss": -0.0822, "step": 844, "step_time": 13.117287589018815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.78125, "completions/mean_terminated_length": 4.199999809265137, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7468414139002562, "epoch": 0.00845, "frac_reward_zero_std": 0.0, "grad_norm": 0.260591596364975, "kl": 0.4890011604875326, "learning_rate": 7.9997576883883e-06, "loss": -0.0367, "num_tokens": 22395926.0, "reward": -0.07597333192825317, "reward_std": 1.056001901626587, "rewards/rollout_reward_func/mean": -0.07597333192825317, "rewards/rollout_reward_func/std": 1.056001901626587, "sampling/importance_sampling_ratio/max": 1.1669371128082275, "sampling/importance_sampling_ratio/mean": 0.5916820764541626, "sampling/importance_sampling_ratio/min": 3.182150749125867e-06, "sampling/sampling_logp_difference/max": 1.7309551239013672, "sampling/sampling_logp_difference/mean": 0.3430739939212799, "step": 845, "step_time": 30.67439356999239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.769118751399219, "epoch": 0.00846, "grad_norm": 0.26040002703666687, "kl": 0.5129053890705109, "learning_rate": 7.999757088986322e-06, "loss": -0.038, "step": 846, "step_time": 14.567923939001048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.3125, "completions/mean_terminated_length": 5.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.712172769010067, "epoch": 0.00847, "frac_reward_zero_std": 0.0, "grad_norm": 0.09402909874916077, "kl": 0.2882496491074562, "learning_rate": 7.999756488843926e-06, "loss": -0.1032, "num_tokens": 22459378.0, "reward": 0.3744603991508484, "reward_std": 1.3610193729400635, "rewards/rollout_reward_func/mean": 0.3744603991508484, "rewards/rollout_reward_func/std": 1.361019253730774, "sampling/importance_sampling_ratio/max": 1.4958502054214478, "sampling/importance_sampling_ratio/mean": 0.6421072483062744, "sampling/importance_sampling_ratio/min": 6.588079850189388e-05, "sampling/sampling_logp_difference/max": 1.7487232685089111, "sampling/sampling_logp_difference/mean": 0.29949986934661865, "step": 847, "step_time": 31.241423858024064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7217946648597717, "epoch": 0.00848, "grad_norm": 0.09828653186559677, "kl": 0.2794149983674288, "learning_rate": 7.99975588796111e-06, "loss": -0.1029, "step": 848, "step_time": 14.560735437989933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.375, "completions/mean_terminated_length": 4.38095235824585, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6987540842965245, "epoch": 0.00849, "frac_reward_zero_std": 0.25, "grad_norm": 0.045446354895830154, "kl": 0.22490482591092587, "learning_rate": 7.999755286337875e-06, "loss": -0.0648, "num_tokens": 22513089.0, "reward": 0.12496226280927658, "reward_std": 1.2586897611618042, "rewards/rollout_reward_func/mean": 0.12496226280927658, "rewards/rollout_reward_func/std": 1.2586897611618042, "sampling/importance_sampling_ratio/max": 1.2588969469070435, "sampling/importance_sampling_ratio/mean": 0.658115029335022, "sampling/importance_sampling_ratio/min": 1.2826782949559856e-05, "sampling/sampling_logp_difference/max": 1.769754409790039, "sampling/sampling_logp_difference/mean": 0.30848509073257446, "step": 849, "step_time": 30.212280379011645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7032331936061382, "epoch": 0.0085, "grad_norm": 0.0447842963039875, "kl": 0.2241814574226737, "learning_rate": 7.999754683974221e-06, "loss": -0.0648, "step": 850, "step_time": 13.07718570898578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 5.714285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.274950757622719, "epoch": 0.00851, "frac_reward_zero_std": 0.25, "grad_norm": 0.10538400709629059, "kl": 0.34699054062366486, "learning_rate": 7.999754080870149e-06, "loss": -0.0598, "num_tokens": 22569063.0, "reward": 0.10474362969398499, "reward_std": 1.3143314123153687, "rewards/rollout_reward_func/mean": 0.10474362969398499, "rewards/rollout_reward_func/std": 1.3143314123153687, "sampling/importance_sampling_ratio/max": 1.2475603818893433, "sampling/importance_sampling_ratio/mean": 0.5358802080154419, "sampling/importance_sampling_ratio/min": 1.7316454048454943e-08, "sampling/sampling_logp_difference/max": 2.1551103591918945, "sampling/sampling_logp_difference/mean": 0.40895530581474304, "step": 851, "step_time": 33.25788236600056 }, { "clip_ratio/high_max": 0.005434782709926367, "clip_ratio/high_mean": 0.0027173913549631834, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005842391401529312, "entropy": 2.2793999314308167, "epoch": 0.00852, "grad_norm": 0.09539277106523514, "kl": 0.31157616153359413, "learning_rate": 7.999753477025658e-06, "loss": -0.0602, "step": 852, "step_time": 13.649329802006832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 4.727272987365723, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8177335038781166, "epoch": 0.00853, "frac_reward_zero_std": 0.0, "grad_norm": 0.14420540630817413, "kl": 0.25164174009114504, "learning_rate": 7.999752872440747e-06, "loss": -0.0941, "num_tokens": 22622866.0, "reward": 0.7281153798103333, "reward_std": 1.4482797384262085, "rewards/rollout_reward_func/mean": 0.7281153798103333, "rewards/rollout_reward_func/std": 1.4482797384262085, "sampling/importance_sampling_ratio/max": 1.2291990518569946, "sampling/importance_sampling_ratio/mean": 0.6121433973312378, "sampling/importance_sampling_ratio/min": 2.9447489851008868e-06, "sampling/sampling_logp_difference/max": 2.106415033340454, "sampling/sampling_logp_difference/mean": 0.3263317942619324, "step": 853, "step_time": 30.39775259900489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8139555901288986, "epoch": 0.00854, "grad_norm": 0.11060859262943268, "kl": 0.2446536896750331, "learning_rate": 7.999752267115419e-06, "loss": -0.0945, "step": 854, "step_time": 12.679433503988548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.71875, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2638525869697332, "epoch": 0.00855, "frac_reward_zero_std": 0.25, "grad_norm": 0.17919336259365082, "kl": 0.3332598805427551, "learning_rate": 7.99975166104967e-06, "loss": -0.0604, "num_tokens": 22673110.0, "reward": 0.9701986312866211, "reward_std": 1.1647520065307617, "rewards/rollout_reward_func/mean": 0.9701986312866211, "rewards/rollout_reward_func/std": 1.1647520065307617, "sampling/importance_sampling_ratio/max": 1.6778740882873535, "sampling/importance_sampling_ratio/mean": 0.8187087178230286, "sampling/importance_sampling_ratio/min": 9.661231160862371e-05, "sampling/sampling_logp_difference/max": 1.763305902481079, "sampling/sampling_logp_difference/mean": 0.2545988857746124, "step": 855, "step_time": 28.308382566974615 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.2512452378869057, "epoch": 0.00856, "grad_norm": 0.16173705458641052, "kl": 0.3149231933057308, "learning_rate": 7.999751054243507e-06, "loss": -0.0614, "step": 856, "step_time": 14.354910573005327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.5625, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6029117852449417, "epoch": 0.00857, "frac_reward_zero_std": 0.25, "grad_norm": 0.16637663543224335, "kl": 0.2831467743963003, "learning_rate": 7.999750446696924e-06, "loss": -0.0686, "num_tokens": 22729215.0, "reward": 0.5821689367294312, "reward_std": 1.3036518096923828, "rewards/rollout_reward_func/mean": 0.5821689367294312, "rewards/rollout_reward_func/std": 1.3036516904830933, "sampling/importance_sampling_ratio/max": 1.4821161031723022, "sampling/importance_sampling_ratio/mean": 0.6620761156082153, "sampling/importance_sampling_ratio/min": 3.542610329532181e-06, "sampling/sampling_logp_difference/max": 1.521705985069275, "sampling/sampling_logp_difference/mean": 0.32477548718452454, "step": 857, "step_time": 31.788660112011712 }, { "clip_ratio/high_max": 0.046875, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02911931835114956, "entropy": 1.5552074685692787, "epoch": 0.00858, "grad_norm": 0.08061956614255905, "kl": 0.3115537855774164, "learning_rate": 7.999749838409923e-06, "loss": -0.0697, "step": 858, "step_time": 14.619391611005994 }, { "clip_ratio/high_max": 0.005434782709926367, "clip_ratio/high_mean": 0.0027173913549631834, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027173913549631834, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.375, "completions/mean_terminated_length": 6.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.7573926597833633, "epoch": 0.00859, "frac_reward_zero_std": 0.0, "grad_norm": 0.10756655037403107, "kl": 0.30442591570317745, "learning_rate": 7.999749229382504e-06, "loss": -0.0936, "num_tokens": 22791724.0, "reward": 0.34037458896636963, "reward_std": 1.3210355043411255, "rewards/rollout_reward_func/mean": 0.34037458896636963, "rewards/rollout_reward_func/std": 1.321035385131836, "sampling/importance_sampling_ratio/max": 1.164437174797058, "sampling/importance_sampling_ratio/mean": 0.4086873233318329, "sampling/importance_sampling_ratio/min": 7.756963213978452e-07, "sampling/sampling_logp_difference/max": 1.9339817762374878, "sampling/sampling_logp_difference/mean": 0.453988254070282, "step": 859, "step_time": 32.30989612299891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.754372626543045, "epoch": 0.0086, "grad_norm": 0.13983434438705444, "kl": 0.3470678189769387, "learning_rate": 7.999748619614667e-06, "loss": -0.0936, "step": 860, "step_time": 14.040453838024405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.65625, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8122158646583557, "epoch": 0.00861, "frac_reward_zero_std": 0.25, "grad_norm": 0.08235400170087814, "kl": 0.18032020516693592, "learning_rate": 7.999748009106413e-06, "loss": -0.0513, "num_tokens": 22842837.0, "reward": 0.3010290563106537, "reward_std": 1.3710765838623047, "rewards/rollout_reward_func/mean": 0.3010290563106537, "rewards/rollout_reward_func/std": 1.3710765838623047, "sampling/importance_sampling_ratio/max": 1.0579715967178345, "sampling/importance_sampling_ratio/mean": 0.5999240875244141, "sampling/importance_sampling_ratio/min": 1.4769055978192114e-09, "sampling/sampling_logp_difference/max": 1.998218059539795, "sampling/sampling_logp_difference/mean": 0.29900965094566345, "step": 861, "step_time": 33.38562512601493 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.012620192021131516, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012620192021131516, "entropy": 1.77313432097435, "epoch": 0.00862, "grad_norm": 0.04358995333313942, "kl": 0.17639536503702402, "learning_rate": 7.999747397857743e-06, "loss": -0.0518, "step": 862, "step_time": 13.45445575901249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.96875, "completions/mean_terminated_length": 4.931034564971924, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1362174088135362, "epoch": 0.00863, "frac_reward_zero_std": 0.25, "grad_norm": 0.37974557280540466, "kl": 0.28471879474818707, "learning_rate": 7.999746785868654e-06, "loss": -0.0437, "num_tokens": 22892507.0, "reward": 0.706282377243042, "reward_std": 1.4348759651184082, "rewards/rollout_reward_func/mean": 0.706282377243042, "rewards/rollout_reward_func/std": 1.4348759651184082, "sampling/importance_sampling_ratio/max": 1.281360149383545, "sampling/importance_sampling_ratio/mean": 0.8394398093223572, "sampling/importance_sampling_ratio/min": 3.0662217795907054e-06, "sampling/sampling_logp_difference/max": 1.7531640529632568, "sampling/sampling_logp_difference/mean": 0.24388357996940613, "step": 863, "step_time": 27.51011701299285 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.1231849249452353, "epoch": 0.00864, "grad_norm": 0.07363124191761017, "kl": 0.28324098140001297, "learning_rate": 7.999746173139148e-06, "loss": -0.0441, "step": 864, "step_time": 14.049237191022257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.5625, "completions/mean_terminated_length": 5.199999809265137, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4902400937862694, "epoch": 0.00865, "frac_reward_zero_std": 0.25, "grad_norm": 0.11895539611577988, "kl": 0.4549744566902518, "learning_rate": 7.999745559669226e-06, "loss": -0.0421, "num_tokens": 22946996.0, "reward": 0.4812585115432739, "reward_std": 1.3230632543563843, "rewards/rollout_reward_func/mean": 0.4812585115432739, "rewards/rollout_reward_func/std": 1.3230632543563843, "sampling/importance_sampling_ratio/max": 1.6096786260604858, "sampling/importance_sampling_ratio/mean": 0.6764642000198364, "sampling/importance_sampling_ratio/min": 2.0556582214226182e-08, "sampling/sampling_logp_difference/max": 1.9416108131408691, "sampling/sampling_logp_difference/mean": 0.29597213864326477, "step": 865, "step_time": 30.6422284989967 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00572916679084301, "entropy": 1.487520127557218, "epoch": 0.00866, "grad_norm": 0.09385829418897629, "kl": 0.46084657311439514, "learning_rate": 7.999744945458888e-06, "loss": -0.0425, "step": 866, "step_time": 12.888867680012481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 4.65625, "completions/mean_terminated_length": 4.290322303771973, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.38512743078172207, "epoch": 0.00867, "frac_reward_zero_std": 0.25, "grad_norm": 0.11932782828807831, "kl": 0.6914690416306257, "learning_rate": 7.999744330508132e-06, "loss": -0.0239, "num_tokens": 22990868.0, "reward": 1.7122564315795898, "reward_std": 0.6801016926765442, "rewards/rollout_reward_func/mean": 1.7122564315795898, "rewards/rollout_reward_func/std": 0.6801016330718994, "sampling/importance_sampling_ratio/max": 1.1585694551467896, "sampling/importance_sampling_ratio/mean": 0.943067193031311, "sampling/importance_sampling_ratio/min": 1.799899291654583e-05, "sampling/sampling_logp_difference/max": 1.5372042655944824, "sampling/sampling_logp_difference/mean": 0.11789734661579132, "step": 867, "step_time": 22.626186643989058 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 0.3828506823629141, "epoch": 0.00868, "grad_norm": 0.08469238132238388, "kl": 0.5804418083280325, "learning_rate": 7.999743714816959e-06, "loss": -0.0245, "step": 868, "step_time": 12.91788753302535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.71875, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0769364088773727, "epoch": 0.00869, "frac_reward_zero_std": 0.0, "grad_norm": 0.0795246809720993, "kl": 0.291431438177824, "learning_rate": 7.999743098385372e-06, "loss": -0.0682, "num_tokens": 23036229.0, "reward": -0.3187766373157501, "reward_std": 1.0348314046859741, "rewards/rollout_reward_func/mean": -0.3187766373157501, "rewards/rollout_reward_func/std": 1.0348312854766846, "sampling/importance_sampling_ratio/max": 1.0769426822662354, "sampling/importance_sampling_ratio/mean": 0.7963117361068726, "sampling/importance_sampling_ratio/min": 9.848633908404736e-07, "sampling/sampling_logp_difference/max": 1.6030975580215454, "sampling/sampling_logp_difference/mean": 0.2354462891817093, "step": 869, "step_time": 21.248404054000275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.076087087392807, "epoch": 0.0087, "grad_norm": 0.0815025269985199, "kl": 0.2881446052342653, "learning_rate": 7.999742481213369e-06, "loss": -0.068, "step": 870, "step_time": 11.681640299007995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.96875, "completions/mean_terminated_length": 4.111111164093018, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2050539627671242, "epoch": 0.00871, "frac_reward_zero_std": 0.25, "grad_norm": 0.12774643301963806, "kl": 0.27372000366449356, "learning_rate": 7.999741863300948e-06, "loss": -0.0488, "num_tokens": 23088812.0, "reward": 0.14097647368907928, "reward_std": 1.2064933776855469, "rewards/rollout_reward_func/mean": 0.14097647368907928, "rewards/rollout_reward_func/std": 1.2064932584762573, "sampling/importance_sampling_ratio/max": 1.249153733253479, "sampling/importance_sampling_ratio/mean": 0.8157291412353516, "sampling/importance_sampling_ratio/min": 1.3791728292744665e-07, "sampling/sampling_logp_difference/max": 1.8717619180679321, "sampling/sampling_logp_difference/mean": 0.25456076860427856, "step": 871, "step_time": 39.38612051801465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.2440750896930695, "epoch": 0.00872, "grad_norm": 0.10021340847015381, "kl": 0.27330511435866356, "learning_rate": 7.999741244648114e-06, "loss": -0.0492, "step": 872, "step_time": 20.9954585100204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.90625, "completions/mean_terminated_length": 4.360000133514404, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4174352884292603, "epoch": 0.00873, "frac_reward_zero_std": 0.0, "grad_norm": 0.3107815980911255, "kl": 0.2267369730398059, "learning_rate": 7.999740625254861e-06, "loss": -0.0546, "num_tokens": 23149959.0, "reward": -0.07577939331531525, "reward_std": 1.0990967750549316, "rewards/rollout_reward_func/mean": -0.07577939331531525, "rewards/rollout_reward_func/std": 1.0990968942642212, "sampling/importance_sampling_ratio/max": 1.4802451133728027, "sampling/importance_sampling_ratio/mean": 0.7046911716461182, "sampling/importance_sampling_ratio/min": 1.355706262984313e-05, "sampling/sampling_logp_difference/max": 1.8779683113098145, "sampling/sampling_logp_difference/mean": 0.276518851518631, "step": 873, "step_time": 30.29615366102371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.06621170975267887, "clip_ratio/low_min": 0.027309782803058624, "clip_ratio/region_mean": 0.06621170975267887, "entropy": 1.565369602292776, "epoch": 0.00874, "grad_norm": 0.23968157172203064, "kl": 0.23503000009804964, "learning_rate": 7.999740005121195e-06, "loss": -0.058, "step": 874, "step_time": 14.071740792001947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.21875, "completions/mean_terminated_length": 5.192307949066162, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6924396101385355, "epoch": 0.00875, "frac_reward_zero_std": 0.25, "grad_norm": 0.17920951545238495, "kl": 0.22042311448603868, "learning_rate": 7.999739384247114e-06, "loss": -0.0673, "num_tokens": 23207099.0, "reward": 1.0110979080200195, "reward_std": 1.17205810546875, "rewards/rollout_reward_func/mean": 1.0110979080200195, "rewards/rollout_reward_func/std": 1.1720579862594604, "sampling/importance_sampling_ratio/max": 1.6227596998214722, "sampling/importance_sampling_ratio/mean": 0.7372153401374817, "sampling/importance_sampling_ratio/min": 1.035067907650955e-05, "sampling/sampling_logp_difference/max": 1.7256028652191162, "sampling/sampling_logp_difference/mean": 0.30116409063339233, "step": 875, "step_time": 30.56205374300771 }, { "clip_ratio/high_max": 0.005434782709926367, "clip_ratio/high_mean": 0.0027173913549631834, "clip_ratio/low_mean": 0.018914473708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0216318650636822, "entropy": 1.730198934674263, "epoch": 0.00876, "grad_norm": 0.08635500818490982, "kl": 0.21336886007338762, "learning_rate": 7.999738762632617e-06, "loss": -0.068, "step": 876, "step_time": 13.798197069991147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.1875, "completions/mean_terminated_length": 6.210526466369629, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.6503407061100006, "epoch": 0.00877, "frac_reward_zero_std": 0.0, "grad_norm": 0.11698930710554123, "kl": 0.2890829723328352, "learning_rate": 7.999738140277704e-06, "loss": -0.0904, "num_tokens": 23269648.0, "reward": -0.20020759105682373, "reward_std": 0.9203664660453796, "rewards/rollout_reward_func/mean": -0.20020759105682373, "rewards/rollout_reward_func/std": 0.9203664064407349, "sampling/importance_sampling_ratio/max": 1.5285460948944092, "sampling/importance_sampling_ratio/mean": 0.38505813479423523, "sampling/importance_sampling_ratio/min": 2.5261681635413424e-09, "sampling/sampling_logp_difference/max": 2.1997902393341064, "sampling/sampling_logp_difference/mean": 0.4368364214897156, "step": 877, "step_time": 36.09405529400101 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.008971292059868574, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010924417059868574, "entropy": 2.696857050061226, "epoch": 0.00878, "grad_norm": 0.11161115020513535, "kl": 0.276045897975564, "learning_rate": 7.999737517182377e-06, "loss": -0.0899, "step": 878, "step_time": 15.226233768989914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.9375, "completions/mean_terminated_length": 5.727272987365723, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.335028797388077, "epoch": 0.00879, "frac_reward_zero_std": 0.0, "grad_norm": 0.09258928894996643, "kl": 0.5598417976871133, "learning_rate": 7.999736893346636e-06, "loss": -0.0692, "num_tokens": 23330782.0, "reward": 0.4766065776348114, "reward_std": 1.3120075464248657, "rewards/rollout_reward_func/mean": 0.4766065776348114, "rewards/rollout_reward_func/std": 1.3120075464248657, "sampling/importance_sampling_ratio/max": 1.3438233137130737, "sampling/importance_sampling_ratio/mean": 0.5420747995376587, "sampling/importance_sampling_ratio/min": 1.7707550703516262e-10, "sampling/sampling_logp_difference/max": 2.406017780303955, "sampling/sampling_logp_difference/mean": 0.3928374648094177, "step": 879, "step_time": 36.61271161698096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008333333767950535, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008333333767950535, "entropy": 2.3459922522306442, "epoch": 0.0088, "grad_norm": 0.08847274631261826, "kl": 0.5847416687756777, "learning_rate": 7.99973626877048e-06, "loss": -0.0692, "step": 880, "step_time": 15.873688876992674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.03125, "completions/mean_terminated_length": 6.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.208832949399948, "epoch": 0.00881, "frac_reward_zero_std": 0.0, "grad_norm": 0.13517329096794128, "kl": 0.2755015429574996, "learning_rate": 7.99973564345391e-06, "loss": -0.0588, "num_tokens": 23396118.0, "reward": -0.2933269143104553, "reward_std": 1.033454418182373, "rewards/rollout_reward_func/mean": -0.2933269143104553, "rewards/rollout_reward_func/std": 1.0334545373916626, "sampling/importance_sampling_ratio/max": 1.3855664730072021, "sampling/importance_sampling_ratio/mean": 0.21853873133659363, "sampling/importance_sampling_ratio/min": 8.953730343819188e-07, "sampling/sampling_logp_difference/max": 2.054938316345215, "sampling/sampling_logp_difference/mean": 0.4884229004383087, "step": 881, "step_time": 33.82446630098275 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 3.2203530371189117, "epoch": 0.00882, "grad_norm": 0.12283316254615784, "kl": 0.27495782473124564, "learning_rate": 7.999735017396925e-06, "loss": -0.059, "step": 882, "step_time": 13.84538430502289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.4375, "completions/mean_terminated_length": 6.266666889190674, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.4657051861286163, "epoch": 0.00883, "frac_reward_zero_std": 0.0, "grad_norm": 0.11129654198884964, "kl": 0.20672452077269554, "learning_rate": 7.999734390599525e-06, "loss": -0.0644, "num_tokens": 23459555.0, "reward": -0.5931673049926758, "reward_std": 0.7815883755683899, "rewards/rollout_reward_func/mean": -0.5931673049926758, "rewards/rollout_reward_func/std": 0.7815883755683899, "sampling/importance_sampling_ratio/max": 1.0529805421829224, "sampling/importance_sampling_ratio/mean": 0.23441776633262634, "sampling/importance_sampling_ratio/min": 1.1488022266803455e-07, "sampling/sampling_logp_difference/max": 2.0200746059417725, "sampling/sampling_logp_difference/mean": 0.5162185430526733, "step": 883, "step_time": 38.36541446601041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.4482440650463104, "epoch": 0.00884, "grad_norm": 0.11206028610467911, "kl": 0.19740554643794894, "learning_rate": 7.999733763061714e-06, "loss": -0.065, "step": 884, "step_time": 16.03043996602355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.375, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.0436224341392517, "epoch": 0.00885, "frac_reward_zero_std": 0.0, "grad_norm": 0.18329909443855286, "kl": 1.0361698507331312, "learning_rate": 7.999733134783487e-06, "loss": -0.1, "num_tokens": 23525066.0, "reward": 0.2870763838291168, "reward_std": 1.2547532320022583, "rewards/rollout_reward_func/mean": 0.2870763838291168, "rewards/rollout_reward_func/std": 1.2547532320022583, "sampling/importance_sampling_ratio/max": 1.2050834894180298, "sampling/importance_sampling_ratio/mean": 0.37856969237327576, "sampling/importance_sampling_ratio/min": 5.835124605724218e-10, "sampling/sampling_logp_difference/max": 2.3366646766662598, "sampling/sampling_logp_difference/mean": 0.5168930888175964, "step": 885, "step_time": 37.75929519798956 }, { "clip_ratio/high_max": 0.015050167683511972, "clip_ratio/high_mean": 0.007525083841755986, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007525083841755986, "entropy": 3.042053371667862, "epoch": 0.00886, "grad_norm": 0.1697283238172531, "kl": 0.8721002410165966, "learning_rate": 7.999732505764847e-06, "loss": -0.1005, "step": 886, "step_time": 15.261040008015698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.65625, "completions/mean_terminated_length": 5.315789699554443, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3692525383085012, "epoch": 0.00887, "frac_reward_zero_std": 0.0, "grad_norm": 0.11278073489665985, "kl": 0.16241752170026302, "learning_rate": 7.999731876005793e-06, "loss": -0.0901, "num_tokens": 23585294.0, "reward": 0.5205057263374329, "reward_std": 1.2901793718338013, "rewards/rollout_reward_func/mean": 0.5205057263374329, "rewards/rollout_reward_func/std": 1.2901793718338013, "sampling/importance_sampling_ratio/max": 1.181312084197998, "sampling/importance_sampling_ratio/mean": 0.4680028557777405, "sampling/importance_sampling_ratio/min": 1.3672085685811908e-07, "sampling/sampling_logp_difference/max": 2.093235731124878, "sampling/sampling_logp_difference/mean": 0.4301164448261261, "step": 887, "step_time": 34.79771209800674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3549126852303743, "epoch": 0.00888, "grad_norm": 0.10885592550039291, "kl": 0.1622416004538536, "learning_rate": 7.999731245506327e-06, "loss": -0.0904, "step": 888, "step_time": 16.11808682201081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.6875, "completions/mean_terminated_length": 5.900000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4829954653978348, "epoch": 0.00889, "frac_reward_zero_std": 0.25, "grad_norm": 0.05858520790934563, "kl": 0.15378512907773256, "learning_rate": 7.999730614266446e-06, "loss": -0.0357, "num_tokens": 23641410.0, "reward": 0.2786274254322052, "reward_std": 1.3860228061676025, "rewards/rollout_reward_func/mean": 0.2786274254322052, "rewards/rollout_reward_func/std": 1.3860228061676025, "sampling/importance_sampling_ratio/max": 1.0935763120651245, "sampling/importance_sampling_ratio/mean": 0.3996817171573639, "sampling/importance_sampling_ratio/min": 0.00014673848636448383, "sampling/sampling_logp_difference/max": 2.019134521484375, "sampling/sampling_logp_difference/mean": 0.3761349022388458, "step": 889, "step_time": 33.29665996399126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4779874831438065, "epoch": 0.0089, "grad_norm": 0.06053842604160309, "kl": 0.15400753961876035, "learning_rate": 7.999729982286154e-06, "loss": -0.0359, "step": 890, "step_time": 14.09982932699495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.90625, "completions/mean_terminated_length": 4.360000133514404, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5952055528759956, "epoch": 0.00891, "frac_reward_zero_std": 0.25, "grad_norm": 0.1727244108915329, "kl": 0.42866148240864277, "learning_rate": 7.999729349565447e-06, "loss": -0.0168, "num_tokens": 23689057.0, "reward": 1.124790906906128, "reward_std": 1.290047287940979, "rewards/rollout_reward_func/mean": 1.124790906906128, "rewards/rollout_reward_func/std": 1.290047287940979, "sampling/importance_sampling_ratio/max": 1.1760388612747192, "sampling/importance_sampling_ratio/mean": 0.6701521873474121, "sampling/importance_sampling_ratio/min": 2.2351348150095873e-07, "sampling/sampling_logp_difference/max": 2.1291606426239014, "sampling/sampling_logp_difference/mean": 0.294899582862854, "step": 891, "step_time": 26.91255719399487 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010937500046566129, "entropy": 1.5868337154388428, "epoch": 0.00892, "grad_norm": 0.10939602553844452, "kl": 0.4241825984790921, "learning_rate": 7.99972871610433e-06, "loss": -0.0173, "step": 892, "step_time": 12.590734429017175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.5625, "completions/mean_terminated_length": 6.083333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.099562093615532, "epoch": 0.00893, "frac_reward_zero_std": 0.25, "grad_norm": 0.04264507070183754, "kl": 0.22420657146722078, "learning_rate": 7.999728081902798e-06, "loss": -0.0525, "num_tokens": 23744866.0, "reward": 0.6877243518829346, "reward_std": 1.3491852283477783, "rewards/rollout_reward_func/mean": 0.6877243518829346, "rewards/rollout_reward_func/std": 1.3491851091384888, "sampling/importance_sampling_ratio/max": 1.404069423675537, "sampling/importance_sampling_ratio/mean": 0.5876826047897339, "sampling/importance_sampling_ratio/min": 4.628015631169546e-06, "sampling/sampling_logp_difference/max": 1.9688678979873657, "sampling/sampling_logp_difference/mean": 0.33807411789894104, "step": 893, "step_time": 33.52214311699208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.094301849603653, "epoch": 0.00894, "grad_norm": 0.04306407272815704, "kl": 0.2153262160718441, "learning_rate": 7.999727446960856e-06, "loss": -0.0526, "step": 894, "step_time": 13.836068119999254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.71875, "completions/mean_terminated_length": 5.4375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.7339548468589783, "epoch": 0.00895, "frac_reward_zero_std": 0.0, "grad_norm": 0.12893129885196686, "kl": 0.1873420411720872, "learning_rate": 7.999726811278499e-06, "loss": -0.0599, "num_tokens": 23806121.0, "reward": 0.15213948488235474, "reward_std": 1.192610263824463, "rewards/rollout_reward_func/mean": 0.15213948488235474, "rewards/rollout_reward_func/std": 1.1926103830337524, "sampling/importance_sampling_ratio/max": 1.1824167966842651, "sampling/importance_sampling_ratio/mean": 0.3642878532409668, "sampling/importance_sampling_ratio/min": 2.7257067358732456e-07, "sampling/sampling_logp_difference/max": 2.3444621562957764, "sampling/sampling_logp_difference/mean": 0.4092944264411926, "step": 895, "step_time": 40.696243545011384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7341893911361694, "epoch": 0.00896, "grad_norm": 0.12232881784439087, "kl": 0.18819135567173362, "learning_rate": 7.99972617485573e-06, "loss": -0.0602, "step": 896, "step_time": 15.738952315994538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.09375, "completions/mean_terminated_length": 5.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.408072307705879, "epoch": 0.00897, "frac_reward_zero_std": 0.25, "grad_norm": 0.1770011931657791, "kl": 0.13463934184983373, "learning_rate": 7.999725537692552e-06, "loss": -0.0518, "num_tokens": 23861789.0, "reward": 0.38223880529403687, "reward_std": 1.3815020322799683, "rewards/rollout_reward_func/mean": 0.38223880529403687, "rewards/rollout_reward_func/std": 1.3815019130706787, "sampling/importance_sampling_ratio/max": 1.2467657327651978, "sampling/importance_sampling_ratio/mean": 0.5089111924171448, "sampling/importance_sampling_ratio/min": 3.2519050563450946e-08, "sampling/sampling_logp_difference/max": 2.2799484729766846, "sampling/sampling_logp_difference/mean": 0.41741281747817993, "step": 897, "step_time": 32.9739128860092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3984537944197655, "epoch": 0.00898, "grad_norm": 0.1627361923456192, "kl": 0.13588682608678937, "learning_rate": 7.99972489978896e-06, "loss": -0.0529, "step": 898, "step_time": 12.63915502197051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.90625, "completions/mean_terminated_length": 6.94444465637207, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.9239729046821594, "epoch": 0.00899, "frac_reward_zero_std": 0.0, "grad_norm": 0.06371793150901794, "kl": 0.39936038944870234, "learning_rate": 7.999724261144958e-06, "loss": -0.0624, "num_tokens": 23925172.0, "reward": 0.10817907005548477, "reward_std": 1.2809429168701172, "rewards/rollout_reward_func/mean": 0.10817907005548477, "rewards/rollout_reward_func/std": 1.2809429168701172, "sampling/importance_sampling_ratio/max": 1.2491339445114136, "sampling/importance_sampling_ratio/mean": 0.35074347257614136, "sampling/importance_sampling_ratio/min": 3.227204615541268e-06, "sampling/sampling_logp_difference/max": 2.4015402793884277, "sampling/sampling_logp_difference/mean": 0.4595852494239807, "step": 899, "step_time": 37.207430407986976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005627394653856754, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005627394653856754, "entropy": 2.9209894835948944, "epoch": 0.009, "grad_norm": 0.06240848824381828, "kl": 0.42372704297304153, "learning_rate": 7.999723621760544e-06, "loss": -0.0623, "step": 900, "step_time": 15.63746687398816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.125, "completions/mean_terminated_length": 5.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.707261223345995, "epoch": 0.00901, "frac_reward_zero_std": 0.0, "grad_norm": 0.08314339071512222, "kl": 0.2161787822842598, "learning_rate": 7.999722981635718e-06, "loss": -0.08, "num_tokens": 23973085.0, "reward": 0.7741222381591797, "reward_std": 1.4288976192474365, "rewards/rollout_reward_func/mean": 0.7741222381591797, "rewards/rollout_reward_func/std": 1.4288976192474365, "sampling/importance_sampling_ratio/max": 1.1611382961273193, "sampling/importance_sampling_ratio/mean": 0.6246079802513123, "sampling/importance_sampling_ratio/min": 4.01899370672254e-07, "sampling/sampling_logp_difference/max": 2.0377774238586426, "sampling/sampling_logp_difference/mean": 0.3034050762653351, "step": 901, "step_time": 31.469548026987468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 1.7004433684051037, "epoch": 0.00902, "grad_norm": 0.09568247199058533, "kl": 0.22487741662189364, "learning_rate": 7.999722340770481e-06, "loss": -0.08, "step": 902, "step_time": 12.704115755012026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.8125, "completions/mean_terminated_length": 6.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.422443710267544, "epoch": 0.00903, "frac_reward_zero_std": 0.0, "grad_norm": 0.12593334913253784, "kl": 0.327782872132957, "learning_rate": 7.999721699164835e-06, "loss": -0.0746, "num_tokens": 24026298.0, "reward": 0.09345892071723938, "reward_std": 1.0552126169204712, "rewards/rollout_reward_func/mean": 0.09345892071723938, "rewards/rollout_reward_func/std": 1.0552127361297607, "sampling/importance_sampling_ratio/max": 1.1173430681228638, "sampling/importance_sampling_ratio/mean": 0.41955769062042236, "sampling/importance_sampling_ratio/min": 4.359889658189786e-07, "sampling/sampling_logp_difference/max": 1.9933580160140991, "sampling/sampling_logp_difference/mean": 0.4409486651420593, "step": 903, "step_time": 32.87648254400119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 2.3918033093214035, "epoch": 0.00904, "grad_norm": 0.11653409153223038, "kl": 0.359276975505054, "learning_rate": 7.999721056818777e-06, "loss": -0.075, "step": 904, "step_time": 14.179035122011555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.03125, "completions/mean_terminated_length": 5.947368621826172, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.890295371413231, "epoch": 0.00905, "frac_reward_zero_std": 0.0, "grad_norm": 0.036220017820596695, "kl": 0.21619955962523818, "learning_rate": 7.999720413732308e-06, "loss": -0.1027, "num_tokens": 24091323.0, "reward": 0.3530122935771942, "reward_std": 1.3360177278518677, "rewards/rollout_reward_func/mean": 0.3530122935771942, "rewards/rollout_reward_func/std": 1.3360177278518677, "sampling/importance_sampling_ratio/max": 1.0839533805847168, "sampling/importance_sampling_ratio/mean": 0.4032893180847168, "sampling/importance_sampling_ratio/min": 8.765414172273722e-09, "sampling/sampling_logp_difference/max": 1.9396679401397705, "sampling/sampling_logp_difference/mean": 0.4620203375816345, "step": 905, "step_time": 34.56363878099364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8842604607343674, "epoch": 0.00906, "grad_norm": 0.03473438695073128, "kl": 0.22200241591781378, "learning_rate": 7.999719769905428e-06, "loss": -0.1025, "step": 906, "step_time": 13.83146853701328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.28125, "completions/mean_terminated_length": 5.708333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9756785742938519, "epoch": 0.00907, "frac_reward_zero_std": 0.0, "grad_norm": 0.025262823328375816, "kl": 0.23813257087022066, "learning_rate": 7.999719125338138e-06, "loss": -0.0919, "num_tokens": 24149800.0, "reward": 0.5501958727836609, "reward_std": 1.2683972120285034, "rewards/rollout_reward_func/mean": 0.5501958727836609, "rewards/rollout_reward_func/std": 1.2683972120285034, "sampling/importance_sampling_ratio/max": 1.3178895711898804, "sampling/importance_sampling_ratio/mean": 0.6366909742355347, "sampling/importance_sampling_ratio/min": 8.333153346029576e-06, "sampling/sampling_logp_difference/max": 2.73537540435791, "sampling/sampling_logp_difference/mean": 0.36813437938690186, "step": 907, "step_time": 33.88847147600609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "entropy": 1.9570847935974598, "epoch": 0.00908, "grad_norm": 0.023184673860669136, "kl": 0.24781683646142483, "learning_rate": 7.999718480030437e-06, "loss": -0.0919, "step": 908, "step_time": 14.274996067979373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 5.090909004211426, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0155265778303146, "epoch": 0.00909, "frac_reward_zero_std": 0.25, "grad_norm": 0.057366952300071716, "kl": 0.2738102590665221, "learning_rate": 7.999717833982328e-06, "loss": -0.0691, "num_tokens": 24201001.0, "reward": 0.2955363988876343, "reward_std": 1.3044711351394653, "rewards/rollout_reward_func/mean": 0.2955363988876343, "rewards/rollout_reward_func/std": 1.3044711351394653, "sampling/importance_sampling_ratio/max": 1.1517486572265625, "sampling/importance_sampling_ratio/mean": 0.5581563711166382, "sampling/importance_sampling_ratio/min": 6.9671500568802e-07, "sampling/sampling_logp_difference/max": 2.114833116531372, "sampling/sampling_logp_difference/mean": 0.35426491498947144, "step": 909, "step_time": 35.574876559956465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005842391401529312, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005842391401529312, "entropy": 1.9969817847013474, "epoch": 0.0091, "grad_norm": 0.046685874462127686, "kl": 0.27962086629122496, "learning_rate": 7.99971718719381e-06, "loss": -0.0692, "step": 910, "step_time": 13.825424166003359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.15625, "completions/mean_terminated_length": 5.08695650100708, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1104585975408554, "epoch": 0.00911, "frac_reward_zero_std": 0.25, "grad_norm": 0.08388110250234604, "kl": 0.32866284903138876, "learning_rate": 7.999716539664878e-06, "loss": -0.0518, "num_tokens": 24260761.0, "reward": 0.4848933517932892, "reward_std": 1.2932931184768677, "rewards/rollout_reward_func/mean": 0.4848933517932892, "rewards/rollout_reward_func/std": 1.2932929992675781, "sampling/importance_sampling_ratio/max": 1.1662064790725708, "sampling/importance_sampling_ratio/mean": 0.6284805536270142, "sampling/importance_sampling_ratio/min": 2.1279786821537527e-08, "sampling/sampling_logp_difference/max": 2.41355037689209, "sampling/sampling_logp_difference/mean": 0.3930402398109436, "step": 911, "step_time": 32.327402761016856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1075804829597473, "epoch": 0.00912, "grad_norm": 0.08025620132684708, "kl": 0.32105002272874117, "learning_rate": 7.99971589139554e-06, "loss": -0.052, "step": 912, "step_time": 15.880515894983546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.4375, "completions/mean_terminated_length": 4.583333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.074518034234643, "epoch": 0.00913, "frac_reward_zero_std": 0.25, "grad_norm": 0.07987822592258453, "kl": 0.5959853250533342, "learning_rate": 7.999715242385793e-06, "loss": -0.055, "num_tokens": 24316310.0, "reward": 0.43136066198349, "reward_std": 1.3177735805511475, "rewards/rollout_reward_func/mean": 0.43136066198349, "rewards/rollout_reward_func/std": 1.3177735805511475, "sampling/importance_sampling_ratio/max": 1.1696676015853882, "sampling/importance_sampling_ratio/mean": 0.6453860998153687, "sampling/importance_sampling_ratio/min": 8.633984152628393e-12, "sampling/sampling_logp_difference/max": 2.404430866241455, "sampling/sampling_logp_difference/mean": 0.4007011950016022, "step": 913, "step_time": 33.08554074499989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0686774477362633, "epoch": 0.00914, "grad_norm": 0.08026701956987381, "kl": 0.6005471274256706, "learning_rate": 7.999714592635635e-06, "loss": -0.055, "step": 914, "step_time": 16.572195507993456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.3125, "completions/mean_terminated_length": 4.880000114440918, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9581867158412933, "epoch": 0.00915, "frac_reward_zero_std": 0.0, "grad_norm": 0.07059754431247711, "kl": 0.3613625206053257, "learning_rate": 7.999713942145069e-06, "loss": -0.0842, "num_tokens": 24374979.0, "reward": 0.9459461569786072, "reward_std": 1.231093406677246, "rewards/rollout_reward_func/mean": 0.9459461569786072, "rewards/rollout_reward_func/std": 1.231093406677246, "sampling/importance_sampling_ratio/max": 1.1937693357467651, "sampling/importance_sampling_ratio/mean": 0.6957184076309204, "sampling/importance_sampling_ratio/min": 4.29255342382362e-09, "sampling/sampling_logp_difference/max": 2.124350070953369, "sampling/sampling_logp_difference/mean": 0.4000921845436096, "step": 915, "step_time": 31.560251729984884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9631750285625458, "epoch": 0.00916, "grad_norm": 0.06810065358877182, "kl": 0.35883661918342113, "learning_rate": 7.999713290914094e-06, "loss": -0.0844, "step": 916, "step_time": 14.547829196962994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 4.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6652565523982048, "epoch": 0.00917, "frac_reward_zero_std": 0.25, "grad_norm": 0.0800866186618805, "kl": 0.3231171667575836, "learning_rate": 7.99971263894271e-06, "loss": -0.0555, "num_tokens": 24423027.0, "reward": 0.9562541246414185, "reward_std": 1.287745714187622, "rewards/rollout_reward_func/mean": 0.9562541246414185, "rewards/rollout_reward_func/std": 1.2877455949783325, "sampling/importance_sampling_ratio/max": 1.1578145027160645, "sampling/importance_sampling_ratio/mean": 0.6494570374488831, "sampling/importance_sampling_ratio/min": 7.334948008974607e-07, "sampling/sampling_logp_difference/max": 1.8652937412261963, "sampling/sampling_logp_difference/mean": 0.2813035845756531, "step": 917, "step_time": 31.323785660017165 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.66634152084589, "epoch": 0.00918, "grad_norm": 0.04774227365851402, "kl": 0.3121175952255726, "learning_rate": 7.999711986230917e-06, "loss": -0.0557, "step": 918, "step_time": 12.838813895985368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.875, "completions/mean_terminated_length": 4.18181848526001, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9712503720074892, "epoch": 0.00919, "frac_reward_zero_std": 0.0, "grad_norm": 0.07564422488212585, "kl": 0.20369299547746778, "learning_rate": 7.999711332778717e-06, "loss": -0.0883, "num_tokens": 24485752.0, "reward": 0.6283324956893921, "reward_std": 1.2388710975646973, "rewards/rollout_reward_func/mean": 0.6283324956893921, "rewards/rollout_reward_func/std": 1.2388710975646973, "sampling/importance_sampling_ratio/max": 1.2545816898345947, "sampling/importance_sampling_ratio/mean": 0.6493502259254456, "sampling/importance_sampling_ratio/min": 4.2950063061653054e-07, "sampling/sampling_logp_difference/max": 1.8951196670532227, "sampling/sampling_logp_difference/mean": 0.3495234251022339, "step": 919, "step_time": 32.87687367401668 }, { "clip_ratio/high_max": 0.021875000093132257, "clip_ratio/high_mean": 0.010937500046566129, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010937500046566129, "entropy": 1.9635855806991458, "epoch": 0.0092, "grad_norm": 0.05025416612625122, "kl": 0.20726135233417153, "learning_rate": 7.999710678586108e-06, "loss": -0.0886, "step": 920, "step_time": 15.43563307801378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.875, "completions/mean_terminated_length": 4.695652484893799, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0461532920598984, "epoch": 0.00921, "frac_reward_zero_std": 0.0, "grad_norm": 0.0399336963891983, "kl": 0.22844034433364868, "learning_rate": 7.99971002365309e-06, "loss": -0.0847, "num_tokens": 24545103.0, "reward": 0.4006335735321045, "reward_std": 1.3272666931152344, "rewards/rollout_reward_func/mean": 0.4006335735321045, "rewards/rollout_reward_func/std": 1.3272666931152344, "sampling/importance_sampling_ratio/max": 1.2470389604568481, "sampling/importance_sampling_ratio/mean": 0.5891244411468506, "sampling/importance_sampling_ratio/min": 2.1760315576102585e-06, "sampling/sampling_logp_difference/max": 2.3276262283325195, "sampling/sampling_logp_difference/mean": 0.3573727607727051, "step": 921, "step_time": 30.83936546499899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.048064537346363, "epoch": 0.00922, "grad_norm": 0.041666969656944275, "kl": 0.2276206575334072, "learning_rate": 7.999709367979666e-06, "loss": -0.0847, "step": 922, "step_time": 15.07629829698999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 4.869565486907959, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.532433658838272, "epoch": 0.00923, "frac_reward_zero_std": 0.0, "grad_norm": 0.1546103060245514, "kl": 0.19556695595383644, "learning_rate": 7.999708711565832e-06, "loss": -0.0884, "num_tokens": 24611792.0, "reward": 0.3504731357097626, "reward_std": 1.182801604270935, "rewards/rollout_reward_func/mean": 0.3504731357097626, "rewards/rollout_reward_func/std": 1.1828014850616455, "sampling/importance_sampling_ratio/max": 1.1879645586013794, "sampling/importance_sampling_ratio/mean": 0.6325125694274902, "sampling/importance_sampling_ratio/min": 1.783119873977057e-09, "sampling/sampling_logp_difference/max": 2.551168441772461, "sampling/sampling_logp_difference/mean": 0.45007309317588806, "step": 923, "step_time": 35.37642412398418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.539426140487194, "epoch": 0.00924, "grad_norm": 0.15148873627185822, "kl": 0.19090613443404436, "learning_rate": 7.999708054411592e-06, "loss": -0.0888, "step": 924, "step_time": 16.133072220982285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.625, "completions/mean_terminated_length": 4.833333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.861448141746223, "epoch": 0.00925, "frac_reward_zero_std": 0.0, "grad_norm": 0.09329520165920258, "kl": 0.4828461166471243, "learning_rate": 7.999707396516944e-06, "loss": -0.0993, "num_tokens": 24671417.0, "reward": 0.6828197240829468, "reward_std": 1.282874584197998, "rewards/rollout_reward_func/mean": 0.6828197240829468, "rewards/rollout_reward_func/std": 1.282874584197998, "sampling/importance_sampling_ratio/max": 1.4177286624908447, "sampling/importance_sampling_ratio/mean": 0.6732341051101685, "sampling/importance_sampling_ratio/min": 7.466634599495592e-08, "sampling/sampling_logp_difference/max": 2.4423434734344482, "sampling/sampling_logp_difference/mean": 0.3777209520339966, "step": 925, "step_time": 34.12997930501297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8675671806558967, "epoch": 0.00926, "grad_norm": 0.0880785807967186, "kl": 0.46228686161339283, "learning_rate": 7.999706737881888e-06, "loss": -0.0994, "step": 926, "step_time": 15.059602247012663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.40625, "completions/mean_terminated_length": 4.428571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9166956525295973, "epoch": 0.00927, "frac_reward_zero_std": 0.0, "grad_norm": 0.18240974843502045, "kl": 0.19723063800483942, "learning_rate": 7.999706078506426e-06, "loss": -0.0723, "num_tokens": 24728979.0, "reward": 0.182615727186203, "reward_std": 1.196736216545105, "rewards/rollout_reward_func/mean": 0.182615727186203, "rewards/rollout_reward_func/std": 1.1967360973358154, "sampling/importance_sampling_ratio/max": 1.167705774307251, "sampling/importance_sampling_ratio/mean": 0.5694420337677002, "sampling/importance_sampling_ratio/min": 4.215737135382369e-06, "sampling/sampling_logp_difference/max": 2.224590301513672, "sampling/sampling_logp_difference/mean": 0.2917684018611908, "step": 927, "step_time": 32.90565645399329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9098102021962404, "epoch": 0.00928, "grad_norm": 0.17588567733764648, "kl": 0.19919781992211938, "learning_rate": 7.999705418390558e-06, "loss": -0.0724, "step": 928, "step_time": 13.565923119007493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.71875, "completions/mean_terminated_length": 4.904761791229248, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.062144936993718, "epoch": 0.00929, "frac_reward_zero_std": 0.25, "grad_norm": 0.05650343373417854, "kl": 0.7763700017239898, "learning_rate": 7.999704757534282e-06, "loss": -0.0632, "num_tokens": 24783693.0, "reward": 0.7402233481407166, "reward_std": 1.3854597806930542, "rewards/rollout_reward_func/mean": 0.7402233481407166, "rewards/rollout_reward_func/std": 1.3854597806930542, "sampling/importance_sampling_ratio/max": 1.0756101608276367, "sampling/importance_sampling_ratio/mean": 0.5873128771781921, "sampling/importance_sampling_ratio/min": 3.6328461305856763e-07, "sampling/sampling_logp_difference/max": 1.928285837173462, "sampling/sampling_logp_difference/mean": 0.3779524564743042, "step": 929, "step_time": 34.86399676297151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.056062275543809, "epoch": 0.0093, "grad_norm": 0.05400136858224869, "kl": 0.7356720322277397, "learning_rate": 7.9997040959376e-06, "loss": -0.0633, "step": 930, "step_time": 16.09709104898502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8981682434678078, "epoch": 0.00931, "frac_reward_zero_std": 0.25, "grad_norm": 0.01711445488035679, "kl": 0.4074023365974426, "learning_rate": 7.999703433600512e-06, "loss": -0.0571, "num_tokens": 24837244.0, "reward": 1.0275938510894775, "reward_std": 1.184500813484192, "rewards/rollout_reward_func/mean": 1.0275938510894775, "rewards/rollout_reward_func/std": 1.1845009326934814, "sampling/importance_sampling_ratio/max": 1.2788927555084229, "sampling/importance_sampling_ratio/mean": 0.861725389957428, "sampling/importance_sampling_ratio/min": 2.6945399440592155e-05, "sampling/sampling_logp_difference/max": 2.131171226501465, "sampling/sampling_logp_difference/mean": 0.18638849258422852, "step": 931, "step_time": 29.209840037015965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8959943437948823, "epoch": 0.00932, "grad_norm": 0.016253262758255005, "kl": 0.3949806336313486, "learning_rate": 7.999702770523015e-06, "loss": -0.0571, "step": 932, "step_time": 14.532585918015684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.84375, "completions/mean_terminated_length": 4.65217399597168, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5791609082370996, "epoch": 0.00933, "frac_reward_zero_std": 0.0, "grad_norm": 0.041180722415447235, "kl": 0.22837333474308252, "learning_rate": 7.999702106705114e-06, "loss": -0.0784, "num_tokens": 24896799.0, "reward": -0.0012453310191631317, "reward_std": 1.1509310007095337, "rewards/rollout_reward_func/mean": -0.0012453310191631317, "rewards/rollout_reward_func/std": 1.1509311199188232, "sampling/importance_sampling_ratio/max": 1.2361987829208374, "sampling/importance_sampling_ratio/mean": 0.6492336988449097, "sampling/importance_sampling_ratio/min": 5.1014117161685135e-06, "sampling/sampling_logp_difference/max": 1.9115898609161377, "sampling/sampling_logp_difference/mean": 0.30514997243881226, "step": 933, "step_time": 33.999376281994046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.573991060256958, "epoch": 0.00934, "grad_norm": 0.04157725349068642, "kl": 0.22596718510612845, "learning_rate": 7.999701442146808e-06, "loss": -0.0786, "step": 934, "step_time": 13.736930778992246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6425648164004087, "epoch": 0.00935, "frac_reward_zero_std": 0.0, "grad_norm": 0.0507049486041069, "kl": 0.2995260916650295, "learning_rate": 7.999700776848094e-06, "loss": -0.0346, "num_tokens": 24955241.0, "reward": 0.6963506937026978, "reward_std": 1.2151519060134888, "rewards/rollout_reward_func/mean": 0.6963506937026978, "rewards/rollout_reward_func/std": 1.2151517868041992, "sampling/importance_sampling_ratio/max": 1.4241998195648193, "sampling/importance_sampling_ratio/mean": 0.967537522315979, "sampling/importance_sampling_ratio/min": 7.080735667841509e-05, "sampling/sampling_logp_difference/max": 1.8092149496078491, "sampling/sampling_logp_difference/mean": 0.13370540738105774, "step": 935, "step_time": 29.607444651977858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6427227286621928, "epoch": 0.00936, "grad_norm": 0.050891101360321045, "kl": 0.29933728463947773, "learning_rate": 7.999700110808976e-06, "loss": -0.0346, "step": 936, "step_time": 13.810021028009942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.413793087005615, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1778584523126483, "epoch": 0.00937, "frac_reward_zero_std": 0.5, "grad_norm": 0.028046337887644768, "kl": 0.3013693653047085, "learning_rate": 7.999699444029451e-06, "loss": -0.0508, "num_tokens": 24997693.0, "reward": 0.7014030814170837, "reward_std": 1.3291033506393433, "rewards/rollout_reward_func/mean": 0.7014030814170837, "rewards/rollout_reward_func/std": 1.3291033506393433, "sampling/importance_sampling_ratio/max": 1.1219836473464966, "sampling/importance_sampling_ratio/mean": 0.8152577877044678, "sampling/importance_sampling_ratio/min": 0.0002774182357825339, "sampling/sampling_logp_difference/max": 1.674425482749939, "sampling/sampling_logp_difference/mean": 0.20121657848358154, "step": 937, "step_time": 23.79996472899802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1786523750051856, "epoch": 0.00938, "grad_norm": 0.02744499407708645, "kl": 0.30518506839871407, "learning_rate": 7.999698776509521e-06, "loss": -0.0508, "step": 938, "step_time": 11.340665729992907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.84375, "completions/mean_terminated_length": 5.125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3376017659902573, "epoch": 0.00939, "frac_reward_zero_std": 0.0, "grad_norm": 0.14478051662445068, "kl": 0.9052713271230459, "learning_rate": 7.999698108249188e-06, "loss": -0.0666, "num_tokens": 25050009.0, "reward": 0.5245383381843567, "reward_std": 1.4063423871994019, "rewards/rollout_reward_func/mean": 0.5245383381843567, "rewards/rollout_reward_func/std": 1.4063423871994019, "sampling/importance_sampling_ratio/max": 1.1663485765457153, "sampling/importance_sampling_ratio/mean": 0.5831950902938843, "sampling/importance_sampling_ratio/min": 3.1666115773987258e-06, "sampling/sampling_logp_difference/max": 2.044560432434082, "sampling/sampling_logp_difference/mean": 0.4180234968662262, "step": 939, "step_time": 29.448089244018774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3384078294038773, "epoch": 0.0094, "grad_norm": 0.1447863131761551, "kl": 0.8724635746330023, "learning_rate": 7.999697439248448e-06, "loss": -0.0669, "step": 940, "step_time": 12.704765294998651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.28125, "completions/mean_terminated_length": 5.666666507720947, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.585057906806469, "epoch": 0.00941, "frac_reward_zero_std": 0.25, "grad_norm": 0.09264456480741501, "kl": 0.4424054007977247, "learning_rate": 7.999696769507304e-06, "loss": -0.0601, "num_tokens": 25099420.0, "reward": 1.08864426612854, "reward_std": 1.197667121887207, "rewards/rollout_reward_func/mean": 1.08864426612854, "rewards/rollout_reward_func/std": 1.197667121887207, "sampling/importance_sampling_ratio/max": 1.2242786884307861, "sampling/importance_sampling_ratio/mean": 0.7200955152511597, "sampling/importance_sampling_ratio/min": 2.257107917102985e-05, "sampling/sampling_logp_difference/max": 2.7886977195739746, "sampling/sampling_logp_difference/mean": 0.3057543635368347, "step": 941, "step_time": 28.108719266980188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.589201571419835, "epoch": 0.00942, "grad_norm": 0.09066301584243774, "kl": 0.42194170597940683, "learning_rate": 7.999696099025756e-06, "loss": -0.0601, "step": 942, "step_time": 12.711074327977258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.09375, "completions/mean_terminated_length": 4.599999904632568, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0737275099381804, "epoch": 0.00943, "frac_reward_zero_std": 0.25, "grad_norm": 0.0653194934129715, "kl": 0.5927111022174358, "learning_rate": 7.999695427803802e-06, "loss": -0.0546, "num_tokens": 25147290.0, "reward": 1.0209499597549438, "reward_std": 1.1698740720748901, "rewards/rollout_reward_func/mean": 1.0209499597549438, "rewards/rollout_reward_func/std": 1.1698739528656006, "sampling/importance_sampling_ratio/max": 1.1130934953689575, "sampling/importance_sampling_ratio/mean": 0.7008885145187378, "sampling/importance_sampling_ratio/min": 7.187457384816298e-08, "sampling/sampling_logp_difference/max": 1.8266485929489136, "sampling/sampling_logp_difference/mean": 0.41862815618515015, "step": 943, "step_time": 26.83562004099076 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 2.078069109469652, "epoch": 0.00944, "grad_norm": 0.05426570028066635, "kl": 0.539537325501442, "learning_rate": 7.999694755841444e-06, "loss": -0.055, "step": 944, "step_time": 13.972413437979412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 4.319999694824219, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.668678905814886, "epoch": 0.00945, "frac_reward_zero_std": 0.0, "grad_norm": 0.11039552837610245, "kl": 0.3751287180930376, "learning_rate": 7.999694083138682e-06, "loss": -0.0235, "num_tokens": 25198202.0, "reward": 0.3148307204246521, "reward_std": 1.3097703456878662, "rewards/rollout_reward_func/mean": 0.3148307204246521, "rewards/rollout_reward_func/std": 1.3097703456878662, "sampling/importance_sampling_ratio/max": 1.2535712718963623, "sampling/importance_sampling_ratio/mean": 0.7568086385726929, "sampling/importance_sampling_ratio/min": 2.9715732097201908e-09, "sampling/sampling_logp_difference/max": 2.0647315979003906, "sampling/sampling_logp_difference/mean": 0.35839754343032837, "step": 945, "step_time": 29.136944312020205 }, { "clip_ratio/high_max": 0.01105769257992506, "clip_ratio/high_mean": 0.00552884628996253, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00552884628996253, "entropy": 1.6807664362713695, "epoch": 0.00946, "grad_norm": 0.12474733591079712, "kl": 0.3484079521149397, "learning_rate": 7.999693409695516e-06, "loss": -0.0241, "step": 946, "step_time": 14.919241442985367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.03125, "completions/mean_terminated_length": 4.764706134796143, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.715912938117981, "epoch": 0.00947, "frac_reward_zero_std": 0.0, "grad_norm": 0.09965141862630844, "kl": 0.21396902296692133, "learning_rate": 7.999692735511946e-06, "loss": -0.0864, "num_tokens": 25262114.0, "reward": 0.028548412024974823, "reward_std": 1.0526235103607178, "rewards/rollout_reward_func/mean": 0.028548412024974823, "rewards/rollout_reward_func/std": 1.0526235103607178, "sampling/importance_sampling_ratio/max": 1.3175005912780762, "sampling/importance_sampling_ratio/mean": 0.4700472354888916, "sampling/importance_sampling_ratio/min": 3.174321605570185e-08, "sampling/sampling_logp_difference/max": 2.2383065223693848, "sampling/sampling_logp_difference/mean": 0.43049684166908264, "step": 947, "step_time": 30.762680625004577 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "entropy": 2.727844551205635, "epoch": 0.00948, "grad_norm": 0.10940086096525192, "kl": 0.19301608856767416, "learning_rate": 7.999692060587974e-06, "loss": -0.086, "step": 948, "step_time": 12.849741042009555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.3125, "completions/mean_terminated_length": 5.888888835906982, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.773027941584587, "epoch": 0.00949, "frac_reward_zero_std": 0.25, "grad_norm": 0.06066383048892021, "kl": 0.11198106873780489, "learning_rate": 7.999691384923597e-06, "loss": -0.0385, "num_tokens": 25319942.0, "reward": 0.1939554512500763, "reward_std": 1.34629225730896, "rewards/rollout_reward_func/mean": 0.1939554512500763, "rewards/rollout_reward_func/std": 1.34629225730896, "sampling/importance_sampling_ratio/max": 1.1241306066513062, "sampling/importance_sampling_ratio/mean": 0.4370458126068115, "sampling/importance_sampling_ratio/min": 5.053474524174817e-06, "sampling/sampling_logp_difference/max": 2.182562828063965, "sampling/sampling_logp_difference/mean": 0.41034603118896484, "step": 949, "step_time": 35.860425594990375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7818062156438828, "epoch": 0.0095, "grad_norm": 0.06575727462768555, "kl": 0.11166798137128353, "learning_rate": 7.999690708518818e-06, "loss": -0.0385, "step": 950, "step_time": 14.543635146983434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.71875, "completions/mean_terminated_length": 5.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.07889211922884, "epoch": 0.00951, "frac_reward_zero_std": 0.0, "grad_norm": 0.2925131916999817, "kl": 0.21910883113741875, "learning_rate": 7.999690031373634e-06, "loss": -0.096, "num_tokens": 25368418.0, "reward": 0.7209814786911011, "reward_std": 1.342748761177063, "rewards/rollout_reward_func/mean": 0.7209814786911011, "rewards/rollout_reward_func/std": 1.3427486419677734, "sampling/importance_sampling_ratio/max": 1.2500427961349487, "sampling/importance_sampling_ratio/mean": 0.6204681396484375, "sampling/importance_sampling_ratio/min": 3.974432782882786e-09, "sampling/sampling_logp_difference/max": 2.1125919818878174, "sampling/sampling_logp_difference/mean": 0.36027634143829346, "step": 951, "step_time": 29.245748707995517 }, { "clip_ratio/high_max": 0.026988636702299118, "clip_ratio/high_mean": 0.013494318351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013494318351149559, "entropy": 2.0613986402750015, "epoch": 0.00952, "grad_norm": 0.2562534213066101, "kl": 0.22373579628765583, "learning_rate": 7.999689353488049e-06, "loss": -0.0973, "step": 952, "step_time": 12.601176687021507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.09375, "completions/mean_terminated_length": 6.764706134796143, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.0718827694654465, "epoch": 0.00953, "frac_reward_zero_std": 0.0, "grad_norm": 0.045602355152368546, "kl": 0.149941336363554, "learning_rate": 7.99968867486206e-06, "loss": -0.0589, "num_tokens": 25426776.0, "reward": -0.38556015491485596, "reward_std": 0.9824117422103882, "rewards/rollout_reward_func/mean": -0.38556015491485596, "rewards/rollout_reward_func/std": 0.9824117422103882, "sampling/importance_sampling_ratio/max": 1.1985796689987183, "sampling/importance_sampling_ratio/mean": 0.3711455464363098, "sampling/importance_sampling_ratio/min": 2.9132266377018823e-08, "sampling/sampling_logp_difference/max": 2.3807666301727295, "sampling/sampling_logp_difference/mean": 0.48092418909072876, "step": 953, "step_time": 34.816342918013106 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 3.0695818215608597, "epoch": 0.00954, "grad_norm": 0.036293745040893555, "kl": 0.15784705709666014, "learning_rate": 7.999687995495668e-06, "loss": -0.0592, "step": 954, "step_time": 15.095075539997197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.03125, "completions/mean_terminated_length": 5.863636493682861, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.546536698937416, "epoch": 0.00955, "frac_reward_zero_std": 0.0, "grad_norm": 0.12790101766586304, "kl": 0.20529548591002822, "learning_rate": 7.999687315388874e-06, "loss": -0.0817, "num_tokens": 25481068.0, "reward": 0.10076618194580078, "reward_std": 1.3515299558639526, "rewards/rollout_reward_func/mean": 0.10076618194580078, "rewards/rollout_reward_func/std": 1.351529836654663, "sampling/importance_sampling_ratio/max": 1.501891851425171, "sampling/importance_sampling_ratio/mean": 0.4703505039215088, "sampling/importance_sampling_ratio/min": 1.0514190762478393e-05, "sampling/sampling_logp_difference/max": 1.9463962316513062, "sampling/sampling_logp_difference/mean": 0.35671985149383545, "step": 955, "step_time": 31.388089681990095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 2.5411238968372345, "epoch": 0.00956, "grad_norm": 0.11845842748880386, "kl": 0.21250802837312222, "learning_rate": 7.999686634541679e-06, "loss": -0.0824, "step": 956, "step_time": 13.65673059601977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.21875, "completions/mean_terminated_length": 5.192307949066162, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8860895186662674, "epoch": 0.00957, "frac_reward_zero_std": 0.25, "grad_norm": 0.035882700234651566, "kl": 0.3953614104539156, "learning_rate": 7.99968595295408e-06, "loss": -0.0553, "num_tokens": 25534560.0, "reward": 0.4300193190574646, "reward_std": 1.3751627206802368, "rewards/rollout_reward_func/mean": 0.4300193190574646, "rewards/rollout_reward_func/std": 1.3751627206802368, "sampling/importance_sampling_ratio/max": 1.4740073680877686, "sampling/importance_sampling_ratio/mean": 0.6968624591827393, "sampling/importance_sampling_ratio/min": 9.666635492067144e-08, "sampling/sampling_logp_difference/max": 2.6534042358398438, "sampling/sampling_logp_difference/mean": 0.34981435537338257, "step": 957, "step_time": 33.567939511995064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8817549720406532, "epoch": 0.00958, "grad_norm": 0.035147663205862045, "kl": 0.43640359584242105, "learning_rate": 7.99968527062608e-06, "loss": -0.0553, "step": 958, "step_time": 14.201696964009898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.40625, "completions/mean_terminated_length": 5.413793087005615, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6138723827898502, "epoch": 0.00959, "frac_reward_zero_std": 0.0, "grad_norm": 0.256778359413147, "kl": 0.4193998407572508, "learning_rate": 7.999684587557677e-06, "loss": -0.0392, "num_tokens": 25590170.0, "reward": 0.20660746097564697, "reward_std": 1.3692752122879028, "rewards/rollout_reward_func/mean": 0.20660746097564697, "rewards/rollout_reward_func/std": 1.3692752122879028, "sampling/importance_sampling_ratio/max": 1.2163532972335815, "sampling/importance_sampling_ratio/mean": 0.7189469337463379, "sampling/importance_sampling_ratio/min": 2.2898408169602646e-10, "sampling/sampling_logp_difference/max": 2.5762979984283447, "sampling/sampling_logp_difference/mean": 0.3382749557495117, "step": 959, "step_time": 27.591102831996977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01875000004656613, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01875000004656613, "entropy": 1.6159399338066578, "epoch": 0.0096, "grad_norm": 0.17196497321128845, "kl": 0.43382027745246887, "learning_rate": 7.999683903748873e-06, "loss": -0.0398, "step": 960, "step_time": 13.466249272969435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.96875, "completions/mean_terminated_length": 4.884615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4679371789097786, "epoch": 0.00961, "frac_reward_zero_std": 0.0, "grad_norm": 0.13279996812343597, "kl": 0.3432228257879615, "learning_rate": 7.99968321919967e-06, "loss": -0.0554, "num_tokens": 25645998.0, "reward": 0.17777422070503235, "reward_std": 1.2623496055603027, "rewards/rollout_reward_func/mean": 0.17777422070503235, "rewards/rollout_reward_func/std": 1.2623494863510132, "sampling/importance_sampling_ratio/max": 1.1885998249053955, "sampling/importance_sampling_ratio/mean": 0.743209183216095, "sampling/importance_sampling_ratio/min": 2.7192563720745966e-06, "sampling/sampling_logp_difference/max": 2.0267794132232666, "sampling/sampling_logp_difference/mean": 0.3002822995185852, "step": 961, "step_time": 28.97584617798566 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 1.4965261807665229, "epoch": 0.00962, "grad_norm": 0.03563860058784485, "kl": 0.3510380648076534, "learning_rate": 7.999682533910062e-06, "loss": -0.056, "step": 962, "step_time": 13.605209113025921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.40625, "completions/mean_terminated_length": 4.700000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8867812678217888, "epoch": 0.00963, "frac_reward_zero_std": 0.25, "grad_norm": 0.14317236840724945, "kl": 0.5478988476097584, "learning_rate": 7.999681847880053e-06, "loss": -0.0068, "num_tokens": 25688992.0, "reward": 0.7445400953292847, "reward_std": 1.364007830619812, "rewards/rollout_reward_func/mean": 0.7445400953292847, "rewards/rollout_reward_func/std": 1.3640077114105225, "sampling/importance_sampling_ratio/max": 1.4272516965866089, "sampling/importance_sampling_ratio/mean": 0.8390270471572876, "sampling/importance_sampling_ratio/min": 1.519785109849181e-05, "sampling/sampling_logp_difference/max": 2.219074249267578, "sampling/sampling_logp_difference/mean": 0.18800875544548035, "step": 963, "step_time": 26.457144289990538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.94257852435112, "epoch": 0.00964, "grad_norm": 0.17156988382339478, "kl": 0.5153191555291414, "learning_rate": 7.999681161109645e-06, "loss": -0.0079, "step": 964, "step_time": 13.717320576004568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.625, "completions/mean_terminated_length": 5.279999732971191, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9778896272182465, "epoch": 0.00965, "frac_reward_zero_std": 0.0, "grad_norm": 0.05542172119021416, "kl": 0.2899003755301237, "learning_rate": 7.999680473598836e-06, "loss": -0.0782, "num_tokens": 25745356.0, "reward": 0.6150712966918945, "reward_std": 1.2790164947509766, "rewards/rollout_reward_func/mean": 0.6150712966918945, "rewards/rollout_reward_func/std": 1.2790164947509766, "sampling/importance_sampling_ratio/max": 1.2389570474624634, "sampling/importance_sampling_ratio/mean": 0.637393057346344, "sampling/importance_sampling_ratio/min": 1.9012628399650566e-05, "sampling/sampling_logp_difference/max": 1.8310229778289795, "sampling/sampling_logp_difference/mean": 0.3428853750228882, "step": 965, "step_time": 31.329535630968167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.9929365664720535, "epoch": 0.00966, "grad_norm": 0.052182476967573166, "kl": 0.28599586337804794, "learning_rate": 7.999679785347625e-06, "loss": -0.0782, "step": 966, "step_time": 13.711270024985424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.71875, "completions/mean_terminated_length": 4.958333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.167470097541809, "epoch": 0.00967, "frac_reward_zero_std": 0.0, "grad_norm": 0.19699980318546295, "kl": 0.5148649658076465, "learning_rate": 7.999679096356015e-06, "loss": -0.0951, "num_tokens": 25809547.0, "reward": 0.283629447221756, "reward_std": 1.259626030921936, "rewards/rollout_reward_func/mean": 0.283629447221756, "rewards/rollout_reward_func/std": 1.259626030921936, "sampling/importance_sampling_ratio/max": 1.2576360702514648, "sampling/importance_sampling_ratio/mean": 0.5752649307250977, "sampling/importance_sampling_ratio/min": 3.7447676731972024e-05, "sampling/sampling_logp_difference/max": 2.479949474334717, "sampling/sampling_logp_difference/mean": 0.36026060581207275, "step": 967, "step_time": 34.11860416701529 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 2.175325572490692, "epoch": 0.00968, "grad_norm": 0.10752233862876892, "kl": 0.5322956377640367, "learning_rate": 7.999678406624005e-06, "loss": -0.0958, "step": 968, "step_time": 16.06897293101065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.325580656528473, "epoch": 0.00969, "frac_reward_zero_std": 0.25, "grad_norm": 0.04526427388191223, "kl": 0.3834415376186371, "learning_rate": 7.999677716151594e-06, "loss": -0.0609, "num_tokens": 25866039.0, "reward": 0.6230889558792114, "reward_std": 1.2987641096115112, "rewards/rollout_reward_func/mean": 0.6230889558792114, "rewards/rollout_reward_func/std": 1.2987641096115112, "sampling/importance_sampling_ratio/max": 1.3068872690200806, "sampling/importance_sampling_ratio/mean": 0.5662997961044312, "sampling/importance_sampling_ratio/min": 1.4263101810740864e-08, "sampling/sampling_logp_difference/max": 2.260037899017334, "sampling/sampling_logp_difference/mean": 0.442251592874527, "step": 969, "step_time": 33.09842256898992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.327080801129341, "epoch": 0.0097, "grad_norm": 0.04503623768687248, "kl": 0.3859186079353094, "learning_rate": 7.999677024938784e-06, "loss": -0.0609, "step": 970, "step_time": 16.15044532601314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.71875, "completions/mean_terminated_length": 4.576923370361328, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.734460100531578, "epoch": 0.00971, "frac_reward_zero_std": 0.0, "grad_norm": 0.20142658054828644, "kl": 0.3858963046222925, "learning_rate": 7.999676332985574e-06, "loss": -0.0668, "num_tokens": 25926905.0, "reward": -0.011712715029716492, "reward_std": 1.0948046445846558, "rewards/rollout_reward_func/mean": -0.011712715029716492, "rewards/rollout_reward_func/std": 1.0948046445846558, "sampling/importance_sampling_ratio/max": 1.7603000402450562, "sampling/importance_sampling_ratio/mean": 0.7194262742996216, "sampling/importance_sampling_ratio/min": 1.9778387922997354e-06, "sampling/sampling_logp_difference/max": 1.9794297218322754, "sampling/sampling_logp_difference/mean": 0.3564133644104004, "step": 971, "step_time": 27.817803218989866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.7443891912698746, "epoch": 0.00972, "grad_norm": 0.07442762702703476, "kl": 0.37994703091681004, "learning_rate": 7.999675640291963e-06, "loss": -0.0678, "step": 972, "step_time": 12.837789252022048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.78125, "completions/mean_terminated_length": 4.450000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.408658482134342, "epoch": 0.00973, "frac_reward_zero_std": 0.0, "grad_norm": 0.13814154267311096, "kl": 0.20748125691898167, "learning_rate": 7.999674946857955e-06, "loss": -0.0871, "num_tokens": 25986211.0, "reward": -0.08403095602989197, "reward_std": 1.1962437629699707, "rewards/rollout_reward_func/mean": -0.08403095602989197, "rewards/rollout_reward_func/std": 1.1962437629699707, "sampling/importance_sampling_ratio/max": 1.103391170501709, "sampling/importance_sampling_ratio/mean": 0.5523411631584167, "sampling/importance_sampling_ratio/min": 9.699140690599961e-08, "sampling/sampling_logp_difference/max": 2.179478645324707, "sampling/sampling_logp_difference/mean": 0.4545528292655945, "step": 973, "step_time": 34.59650938700361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 2.4140416011214256, "epoch": 0.00974, "grad_norm": 0.11320670694112778, "kl": 0.2173500747885555, "learning_rate": 7.999674252683546e-06, "loss": -0.0874, "step": 974, "step_time": 15.19594416100881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.78125, "completions/mean_terminated_length": 5.0416669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9614399839192629, "epoch": 0.00975, "frac_reward_zero_std": 0.0, "grad_norm": 0.06218063086271286, "kl": 0.2795975902117789, "learning_rate": 7.999673557768738e-06, "loss": -0.0615, "num_tokens": 26042347.0, "reward": 0.03123527765274048, "reward_std": 1.0803732872009277, "rewards/rollout_reward_func/mean": 0.03123527765274048, "rewards/rollout_reward_func/std": 1.0803732872009277, "sampling/importance_sampling_ratio/max": 1.1793979406356812, "sampling/importance_sampling_ratio/mean": 0.6178420782089233, "sampling/importance_sampling_ratio/min": 3.680276449813391e-06, "sampling/sampling_logp_difference/max": 1.9849858283996582, "sampling/sampling_logp_difference/mean": 0.37694481015205383, "step": 975, "step_time": 29.401380306007923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.9781784415245056, "epoch": 0.00976, "grad_norm": 0.04923402518033981, "kl": 0.2803101600147784, "learning_rate": 7.999672862113531e-06, "loss": -0.0617, "step": 976, "step_time": 13.609780161990784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 8.3125, "completions/mean_terminated_length": 4.285714149475098, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9338024146854877, "epoch": 0.00977, "frac_reward_zero_std": 0.0, "grad_norm": 0.18542855978012085, "kl": 0.2306760773062706, "learning_rate": 7.999672165717924e-06, "loss": -0.0735, "num_tokens": 26105414.0, "reward": 0.48960939049720764, "reward_std": 1.298110842704773, "rewards/rollout_reward_func/mean": 0.48960939049720764, "rewards/rollout_reward_func/std": 1.298110842704773, "sampling/importance_sampling_ratio/max": 1.1447322368621826, "sampling/importance_sampling_ratio/mean": 0.5590089559555054, "sampling/importance_sampling_ratio/min": 7.650366740108439e-08, "sampling/sampling_logp_difference/max": 2.416210174560547, "sampling/sampling_logp_difference/mean": 0.352323055267334, "step": 977, "step_time": 36.492340493015945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9285505712032318, "epoch": 0.00978, "grad_norm": 0.18042606115341187, "kl": 0.22858326882123947, "learning_rate": 7.999671468581921e-06, "loss": -0.0742, "step": 978, "step_time": 16.48824280902045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.0625, "completions/mean_terminated_length": 5.909090995788574, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4905280619859695, "epoch": 0.00979, "frac_reward_zero_std": 0.0, "grad_norm": 0.04784020036458969, "kl": 0.5820831968449056, "learning_rate": 7.99967077070552e-06, "loss": -0.0776, "num_tokens": 26160870.0, "reward": 0.14901432394981384, "reward_std": 1.439247488975525, "rewards/rollout_reward_func/mean": 0.14901432394981384, "rewards/rollout_reward_func/std": 1.439247488975525, "sampling/importance_sampling_ratio/max": 1.1154248714447021, "sampling/importance_sampling_ratio/mean": 0.44336017966270447, "sampling/importance_sampling_ratio/min": 1.16017336040386e-05, "sampling/sampling_logp_difference/max": 2.4272000789642334, "sampling/sampling_logp_difference/mean": 0.395368754863739, "step": 979, "step_time": 30.67237798102724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4929366558790207, "epoch": 0.0098, "grad_norm": 0.04595088213682175, "kl": 0.5685934708453715, "learning_rate": 7.999670072088718e-06, "loss": -0.0776, "step": 980, "step_time": 14.05121903399413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.8125, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.054695203900337, "epoch": 0.00981, "frac_reward_zero_std": 0.0, "grad_norm": 0.05081651359796524, "kl": 0.35927942860871553, "learning_rate": 7.999669372731521e-06, "loss": -0.0529, "num_tokens": 26211521.0, "reward": 0.5168423652648926, "reward_std": 1.3783460855484009, "rewards/rollout_reward_func/mean": 0.5168423652648926, "rewards/rollout_reward_func/std": 1.3783462047576904, "sampling/importance_sampling_ratio/max": 1.0595628023147583, "sampling/importance_sampling_ratio/mean": 0.5333484411239624, "sampling/importance_sampling_ratio/min": 1.0658214222303286e-07, "sampling/sampling_logp_difference/max": 2.253711700439453, "sampling/sampling_logp_difference/mean": 0.314602792263031, "step": 981, "step_time": 35.0717741370172 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 2.055865630507469, "epoch": 0.00982, "grad_norm": 0.023802539333701134, "kl": 0.3515664655715227, "learning_rate": 7.999668672633923e-06, "loss": -0.053, "step": 982, "step_time": 13.659013259995845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.40625, "completions/mean_terminated_length": 6.5789475440979, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.9513139575719833, "epoch": 0.00983, "frac_reward_zero_std": 0.0, "grad_norm": 0.22521570324897766, "kl": 0.3488110643811524, "learning_rate": 7.99966797179593e-06, "loss": -0.0476, "num_tokens": 26272213.0, "reward": -0.45944729447364807, "reward_std": 0.8695845007896423, "rewards/rollout_reward_func/mean": -0.45944729447364807, "rewards/rollout_reward_func/std": 0.8695845007896423, "sampling/importance_sampling_ratio/max": 1.2169010639190674, "sampling/importance_sampling_ratio/mean": 0.3239288032054901, "sampling/importance_sampling_ratio/min": 7.31132513465127e-06, "sampling/sampling_logp_difference/max": 1.911034107208252, "sampling/sampling_logp_difference/mean": 0.44172734022140503, "step": 983, "step_time": 37.07871621499362 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 2.939209833741188, "epoch": 0.00984, "grad_norm": 0.07393743097782135, "kl": 0.34173127729445696, "learning_rate": 7.999667270217539e-06, "loss": -0.0481, "step": 984, "step_time": 14.551024527012487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.96875, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3298188522458076, "epoch": 0.00985, "frac_reward_zero_std": 0.0, "grad_norm": 0.045934420078992844, "kl": 0.2689575804397464, "learning_rate": 7.99966656789875e-06, "loss": -0.0709, "num_tokens": 26333339.0, "reward": -0.26630693674087524, "reward_std": 1.0118073225021362, "rewards/rollout_reward_func/mean": -0.26630693674087524, "rewards/rollout_reward_func/std": 1.0118073225021362, "sampling/importance_sampling_ratio/max": 1.1992723941802979, "sampling/importance_sampling_ratio/mean": 0.4704504609107971, "sampling/importance_sampling_ratio/min": 1.9088925000687595e-06, "sampling/sampling_logp_difference/max": 2.076582431793213, "sampling/sampling_logp_difference/mean": 0.40947073698043823, "step": 985, "step_time": 32.44564305599488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.327690666541457, "epoch": 0.00986, "grad_norm": 0.052706051617860794, "kl": 0.2530996333807707, "learning_rate": 7.999665864839563e-06, "loss": -0.0709, "step": 986, "step_time": 14.166959244015743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.53125, "completions/mean_terminated_length": 4.708333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9570505004376173, "epoch": 0.00987, "frac_reward_zero_std": 0.25, "grad_norm": 0.05272751301527023, "kl": 0.2589465966448188, "learning_rate": 7.999665161039979e-06, "loss": -0.0281, "num_tokens": 26383870.0, "reward": 0.05493438243865967, "reward_std": 1.2597922086715698, "rewards/rollout_reward_func/mean": 0.05493438243865967, "rewards/rollout_reward_func/std": 1.2597922086715698, "sampling/importance_sampling_ratio/max": 1.170157551765442, "sampling/importance_sampling_ratio/mean": 0.639660120010376, "sampling/importance_sampling_ratio/min": 2.339023552622166e-07, "sampling/sampling_logp_difference/max": 2.775130033493042, "sampling/sampling_logp_difference/mean": 0.345525860786438, "step": 987, "step_time": 30.184413627983304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9577795304358006, "epoch": 0.00988, "grad_norm": 0.05372180789709091, "kl": 0.2664063908159733, "learning_rate": 7.9996644565e-06, "loss": -0.0281, "step": 988, "step_time": 15.244207506999373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.65625, "completions/mean_terminated_length": 5.319999694824219, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9816566556692123, "epoch": 0.00989, "frac_reward_zero_std": 0.0, "grad_norm": 0.09228627383708954, "kl": 0.3022285159677267, "learning_rate": 7.999663751219622e-06, "loss": -0.0626, "num_tokens": 26441454.0, "reward": 0.5820086002349854, "reward_std": 1.2895894050598145, "rewards/rollout_reward_func/mean": 0.5820086002349854, "rewards/rollout_reward_func/std": 1.289589285850525, "sampling/importance_sampling_ratio/max": 1.1913833618164062, "sampling/importance_sampling_ratio/mean": 0.6371623873710632, "sampling/importance_sampling_ratio/min": 1.2989006563657313e-06, "sampling/sampling_logp_difference/max": 1.882151484489441, "sampling/sampling_logp_difference/mean": 0.3250436782836914, "step": 989, "step_time": 30.7386723109812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9727763384580612, "epoch": 0.0099, "grad_norm": 0.08560820668935776, "kl": 0.29503010120242834, "learning_rate": 7.99966304519885e-06, "loss": -0.0627, "step": 990, "step_time": 14.565496740993694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.46875, "completions/mean_terminated_length": 5.958333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2626823335886, "epoch": 0.00991, "frac_reward_zero_std": 0.0, "grad_norm": 0.06354321539402008, "kl": 0.23340066522359848, "learning_rate": 7.99966233843768e-06, "loss": -0.0946, "num_tokens": 26506096.0, "reward": 0.025303542613983154, "reward_std": 1.1533840894699097, "rewards/rollout_reward_func/mean": 0.025303542613983154, "rewards/rollout_reward_func/std": 1.1533839702606201, "sampling/importance_sampling_ratio/max": 1.39237642288208, "sampling/importance_sampling_ratio/mean": 0.47182348370552063, "sampling/importance_sampling_ratio/min": 0.0001329350779997185, "sampling/sampling_logp_difference/max": 2.113734006881714, "sampling/sampling_logp_difference/mean": 0.38843023777008057, "step": 991, "step_time": 35.929327324964106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.257726937532425, "epoch": 0.00992, "grad_norm": 0.0634373277425766, "kl": 0.2354363091289997, "learning_rate": 7.999661630936115e-06, "loss": -0.0947, "step": 992, "step_time": 14.487847028009128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 4.17391300201416, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.705787718296051, "epoch": 0.00993, "frac_reward_zero_std": 0.5, "grad_norm": 0.016684595495462418, "kl": 0.20341347064822912, "learning_rate": 7.999660922694153e-06, "loss": -0.0243, "num_tokens": 26555132.0, "reward": 0.8637192845344543, "reward_std": 1.3998478651046753, "rewards/rollout_reward_func/mean": 0.8637192845344543, "rewards/rollout_reward_func/std": 1.3998479843139648, "sampling/importance_sampling_ratio/max": 1.172967791557312, "sampling/importance_sampling_ratio/mean": 0.6931151151657104, "sampling/importance_sampling_ratio/min": 6.887871677463409e-07, "sampling/sampling_logp_difference/max": 1.9917153120040894, "sampling/sampling_logp_difference/mean": 0.30870264768600464, "step": 993, "step_time": 29.80635860102484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7063838057219982, "epoch": 0.00994, "grad_norm": 0.0173676498234272, "kl": 0.20114655327051878, "learning_rate": 7.999660213711798e-06, "loss": -0.0242, "step": 994, "step_time": 13.735247567019542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.34375, "completions/mean_terminated_length": 6.473684310913086, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.753773733973503, "epoch": 0.00995, "frac_reward_zero_std": 0.0, "grad_norm": 0.04814927652478218, "kl": 0.1457544551230967, "learning_rate": 7.999659503989044e-06, "loss": -0.0936, "num_tokens": 26625661.0, "reward": 0.1794809252023697, "reward_std": 1.2759028673171997, "rewards/rollout_reward_func/mean": 0.1794809252023697, "rewards/rollout_reward_func/std": 1.2759028673171997, "sampling/importance_sampling_ratio/max": 1.1765414476394653, "sampling/importance_sampling_ratio/mean": 0.4322810769081116, "sampling/importance_sampling_ratio/min": 7.756603137565321e-10, "sampling/sampling_logp_difference/max": 2.2838964462280273, "sampling/sampling_logp_difference/mean": 0.4720257520675659, "step": 995, "step_time": 36.53316618698591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.748581677675247, "epoch": 0.00996, "grad_norm": 0.0483061745762825, "kl": 0.14541599061340094, "learning_rate": 7.999658793525895e-06, "loss": -0.0937, "step": 996, "step_time": 16.32961933698971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.8125, "completions/mean_terminated_length": 5.047619342803955, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.6668468080461025, "epoch": 0.00997, "frac_reward_zero_std": 0.0, "grad_norm": 0.10714829713106155, "kl": 0.1814994621090591, "learning_rate": 7.999658082322352e-06, "loss": -0.0738, "num_tokens": 26680400.0, "reward": 0.14394700527191162, "reward_std": 1.3122618198394775, "rewards/rollout_reward_func/mean": 0.14394700527191162, "rewards/rollout_reward_func/std": 1.3122618198394775, "sampling/importance_sampling_ratio/max": 1.202810525894165, "sampling/importance_sampling_ratio/mean": 0.5121502876281738, "sampling/importance_sampling_ratio/min": 1.3329056969269004e-07, "sampling/sampling_logp_difference/max": 2.348619222640991, "sampling/sampling_logp_difference/mean": 0.47375398874282837, "step": 997, "step_time": 31.487117097974988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.673160906881094, "epoch": 0.00998, "grad_norm": 0.09782402217388153, "kl": 0.18197906576097012, "learning_rate": 7.999657370378414e-06, "loss": -0.074, "step": 998, "step_time": 13.3041936459922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 4.799999713897705, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4522488196380436, "epoch": 0.00999, "frac_reward_zero_std": 0.0, "grad_norm": 0.14176252484321594, "kl": 0.21348614362068474, "learning_rate": 7.999656657694079e-06, "loss": -0.0574, "num_tokens": 26736742.0, "reward": 0.47785818576812744, "reward_std": 1.3368935585021973, "rewards/rollout_reward_func/mean": 0.47785818576812744, "rewards/rollout_reward_func/std": 1.3368935585021973, "sampling/importance_sampling_ratio/max": 1.4572187662124634, "sampling/importance_sampling_ratio/mean": 0.7719601392745972, "sampling/importance_sampling_ratio/min": 7.055588866933249e-06, "sampling/sampling_logp_difference/max": 1.924963116645813, "sampling/sampling_logp_difference/mean": 0.2869594097137451, "step": 999, "step_time": 30.811249799036887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4525894452817738, "epoch": 0.01, "grad_norm": 0.16767258942127228, "kl": 0.21274060918949544, "learning_rate": 7.99965594426935e-06, "loss": -0.0583, "step": 1000, "step_time": 13.74612928199349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.9375, "completions/mean_terminated_length": 4.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6497937738895416, "epoch": 0.01001, "frac_reward_zero_std": 0.0, "grad_norm": 0.22592486441135406, "kl": 0.23982124868780375, "learning_rate": 7.999655230104228e-06, "loss": -0.0313, "num_tokens": 26800542.0, "reward": -0.15862542390823364, "reward_std": 1.0080255270004272, "rewards/rollout_reward_func/mean": -0.15862542390823364, "rewards/rollout_reward_func/std": 1.0080254077911377, "sampling/importance_sampling_ratio/max": 1.4624090194702148, "sampling/importance_sampling_ratio/mean": 0.823127806186676, "sampling/importance_sampling_ratio/min": 2.0472015194172855e-09, "sampling/sampling_logp_difference/max": 2.269745349884033, "sampling/sampling_logp_difference/mean": 0.3530118763446808, "step": 1001, "step_time": 32.24561068000912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00937500037252903, "clip_ratio/low_min": 0.0062500000931322575, "clip_ratio/region_mean": 0.00937500037252903, "entropy": 1.679136361926794, "epoch": 0.01002, "grad_norm": 0.215828999876976, "kl": 0.23348545283079147, "learning_rate": 7.999654515198711e-06, "loss": -0.034, "step": 1002, "step_time": 15.263299083002494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.03125, "completions/mean_terminated_length": 5.947368621826172, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.736364558339119, "epoch": 0.01003, "frac_reward_zero_std": 0.0, "grad_norm": 0.032819800078868866, "kl": 0.31415193155407906, "learning_rate": 7.999653799552799e-06, "loss": -0.0756, "num_tokens": 26860569.0, "reward": 0.3202052414417267, "reward_std": 1.3261005878448486, "rewards/rollout_reward_func/mean": 0.3202052414417267, "rewards/rollout_reward_func/std": 1.326100468635559, "sampling/importance_sampling_ratio/max": 1.0889962911605835, "sampling/importance_sampling_ratio/mean": 0.3993526101112366, "sampling/importance_sampling_ratio/min": 4.349859921148891e-07, "sampling/sampling_logp_difference/max": 2.454022169113159, "sampling/sampling_logp_difference/mean": 0.4808812141418457, "step": 1003, "step_time": 37.24088962098176 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 2.742296740412712, "epoch": 0.01004, "grad_norm": 0.03197595104575157, "kl": 0.2564265257678926, "learning_rate": 7.999653083166494e-06, "loss": -0.0757, "step": 1004, "step_time": 15.180069894020562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.03125, "completions/mean_terminated_length": 5.863636493682861, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.380698537454009, "epoch": 0.01005, "frac_reward_zero_std": 0.25, "grad_norm": 0.06166629120707512, "kl": 0.13953124498948455, "learning_rate": 7.999652366039795e-06, "loss": -0.0399, "num_tokens": 26915744.0, "reward": 0.08720260858535767, "reward_std": 1.238685965538025, "rewards/rollout_reward_func/mean": 0.08720260858535767, "rewards/rollout_reward_func/std": 1.2386858463287354, "sampling/importance_sampling_ratio/max": 1.1408218145370483, "sampling/importance_sampling_ratio/mean": 0.5670903921127319, "sampling/importance_sampling_ratio/min": 6.655351398876519e-07, "sampling/sampling_logp_difference/max": 2.108792543411255, "sampling/sampling_logp_difference/mean": 0.39296799898147583, "step": 1005, "step_time": 34.01630554300209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 2.3799136765301228, "epoch": 0.01006, "grad_norm": 0.0691564604640007, "kl": 0.14942103018984199, "learning_rate": 7.999651648172701e-06, "loss": -0.0397, "step": 1006, "step_time": 15.11790722499427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 5.599999904632568, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4092110991477966, "epoch": 0.01007, "frac_reward_zero_std": 0.0, "grad_norm": 0.2329365611076355, "kl": 0.18026682594791055, "learning_rate": 7.999650929565216e-06, "loss": -0.0827, "num_tokens": 26969515.0, "reward": 0.04484573006629944, "reward_std": 1.2537946701049805, "rewards/rollout_reward_func/mean": 0.04484573006629944, "rewards/rollout_reward_func/std": 1.2537946701049805, "sampling/importance_sampling_ratio/max": 1.2572871446609497, "sampling/importance_sampling_ratio/mean": 0.49652767181396484, "sampling/importance_sampling_ratio/min": 5.143296675669262e-06, "sampling/sampling_logp_difference/max": 2.151515007019043, "sampling/sampling_logp_difference/mean": 0.4005265235900879, "step": 1007, "step_time": 31.547393607033882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 2.4064677208662033, "epoch": 0.01008, "grad_norm": 0.21790480613708496, "kl": 0.17884811107069254, "learning_rate": 7.999650210217335e-06, "loss": -0.0833, "step": 1008, "step_time": 14.350208507996285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.59375, "completions/mean_terminated_length": 6.68181848526001, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4121427410282195, "epoch": 0.01009, "frac_reward_zero_std": 0.0, "grad_norm": 0.14116789400577545, "kl": 0.3727097101509571, "learning_rate": 7.999649490129062e-06, "loss": -0.1132, "num_tokens": 27037131.0, "reward": 0.3515404760837555, "reward_std": 1.2547779083251953, "rewards/rollout_reward_func/mean": 0.3515404760837555, "rewards/rollout_reward_func/std": 1.2547779083251953, "sampling/importance_sampling_ratio/max": 1.3843824863433838, "sampling/importance_sampling_ratio/mean": 0.5120716094970703, "sampling/importance_sampling_ratio/min": 2.496278284525033e-05, "sampling/sampling_logp_difference/max": 1.9775464534759521, "sampling/sampling_logp_difference/mean": 0.39525049924850464, "step": 1009, "step_time": 35.758976320023066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0029761905316263437, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029761905316263437, "entropy": 2.4087158665060997, "epoch": 0.0101, "grad_norm": 0.1127839908003807, "kl": 0.37555689131841063, "learning_rate": 7.999648769300397e-06, "loss": -0.1139, "step": 1010, "step_time": 14.356578210994485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5625, "completions/mean_terminated_length": 4.6666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1249947058968246, "epoch": 0.01011, "frac_reward_zero_std": 0.0, "grad_norm": 0.0740278884768486, "kl": 0.2545004873536527, "learning_rate": 7.999648047731338e-06, "loss": -0.0762, "num_tokens": 27098020.0, "reward": 0.0730387270450592, "reward_std": 1.1854772567749023, "rewards/rollout_reward_func/mean": 0.0730387270450592, "rewards/rollout_reward_func/std": 1.1854771375656128, "sampling/importance_sampling_ratio/max": 1.2273831367492676, "sampling/importance_sampling_ratio/mean": 0.5372896194458008, "sampling/importance_sampling_ratio/min": 9.390961963617883e-08, "sampling/sampling_logp_difference/max": 1.9528656005859375, "sampling/sampling_logp_difference/mean": 0.35754531621932983, "step": 1011, "step_time": 33.46314969997911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1092260610312223, "epoch": 0.01012, "grad_norm": 0.06916756927967072, "kl": 0.25289271771907806, "learning_rate": 7.999647325421885e-06, "loss": -0.0764, "step": 1012, "step_time": 16.021245464027743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.4375, "completions/mean_terminated_length": 5.851851940155029, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.945422409567982, "epoch": 0.01013, "frac_reward_zero_std": 0.5, "grad_norm": 0.005977428983896971, "kl": 0.2011596318334341, "learning_rate": 7.999646602372042e-06, "loss": -0.0475, "num_tokens": 27143881.0, "reward": 0.912321925163269, "reward_std": 1.4834195375442505, "rewards/rollout_reward_func/mean": 0.912321925163269, "rewards/rollout_reward_func/std": 1.4834195375442505, "sampling/importance_sampling_ratio/max": 1.0631951093673706, "sampling/importance_sampling_ratio/mean": 0.6820614337921143, "sampling/importance_sampling_ratio/min": 5.598486154667626e-07, "sampling/sampling_logp_difference/max": 1.7502504587173462, "sampling/sampling_logp_difference/mean": 0.3430666923522949, "step": 1013, "step_time": 23.500897703997907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 1.9433727287687361, "epoch": 0.01014, "grad_norm": 0.006189399864524603, "kl": 0.20459091663360596, "learning_rate": 7.999645878581808e-06, "loss": -0.0475, "step": 1014, "step_time": 12.005871608984307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.53125, "completions/mean_terminated_length": 4.34615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.293546272907406, "epoch": 0.01015, "frac_reward_zero_std": 0.25, "grad_norm": 0.2206188291311264, "kl": 0.38738410733640194, "learning_rate": 7.99964515405118e-06, "loss": -0.0595, "num_tokens": 27200940.0, "reward": 1.187397837638855, "reward_std": 1.1440094709396362, "rewards/rollout_reward_func/mean": 1.187397837638855, "rewards/rollout_reward_func/std": 1.1440093517303467, "sampling/importance_sampling_ratio/max": 1.1749341487884521, "sampling/importance_sampling_ratio/mean": 0.7479914426803589, "sampling/importance_sampling_ratio/min": 3.589046900742687e-05, "sampling/sampling_logp_difference/max": 1.5608117580413818, "sampling/sampling_logp_difference/mean": 0.23663797974586487, "step": 1015, "step_time": 33.832923422014574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.2893106220290065, "epoch": 0.01016, "grad_norm": 0.07224729657173157, "kl": 0.39751339331269264, "learning_rate": 7.999644428780159e-06, "loss": -0.0603, "step": 1016, "step_time": 16.821001569973305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.8125, "completions/mean_terminated_length": 4.758620738983154, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3250736554618925, "epoch": 0.01017, "frac_reward_zero_std": 0.0, "grad_norm": 0.06925202906131744, "kl": 0.4737748056650162, "learning_rate": 7.999643702768747e-06, "loss": -0.0636, "num_tokens": 27256046.0, "reward": 0.7384852766990662, "reward_std": 1.2797011137008667, "rewards/rollout_reward_func/mean": 0.7384852766990662, "rewards/rollout_reward_func/std": 1.2797011137008667, "sampling/importance_sampling_ratio/max": 1.1578633785247803, "sampling/importance_sampling_ratio/mean": 0.79621422290802, "sampling/importance_sampling_ratio/min": 0.00011976480163866654, "sampling/sampling_logp_difference/max": 1.916519045829773, "sampling/sampling_logp_difference/mean": 0.2589837312698364, "step": 1017, "step_time": 26.08950541097147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3183885499602184, "epoch": 0.01018, "grad_norm": 0.07336552441120148, "kl": 0.4893413819372654, "learning_rate": 7.999642976016945e-06, "loss": -0.0635, "step": 1018, "step_time": 12.649666750017786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.96875, "completions/mean_terminated_length": 4.884615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5713036553934216, "epoch": 0.01019, "frac_reward_zero_std": 0.5, "grad_norm": 0.02299313247203827, "kl": 0.21880314406007528, "learning_rate": 7.99964224852475e-06, "loss": -0.0464, "num_tokens": 27306171.0, "reward": 1.1969469785690308, "reward_std": 1.22885000705719, "rewards/rollout_reward_func/mean": 1.1969469785690308, "rewards/rollout_reward_func/std": 1.22885000705719, "sampling/importance_sampling_ratio/max": 1.2053955793380737, "sampling/importance_sampling_ratio/mean": 0.7122412919998169, "sampling/importance_sampling_ratio/min": 1.8091614037984982e-05, "sampling/sampling_logp_difference/max": 1.6386752128601074, "sampling/sampling_logp_difference/mean": 0.2863623797893524, "step": 1019, "step_time": 29.510731993010268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5693628881126642, "epoch": 0.0102, "grad_norm": 0.023446977138519287, "kl": 0.2177998162806034, "learning_rate": 7.999641520292164e-06, "loss": -0.0464, "step": 1020, "step_time": 13.956335918031982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9600920146331191, "epoch": 0.01021, "frac_reward_zero_std": 0.5, "grad_norm": 0.018993711099028587, "kl": 0.2752886191010475, "learning_rate": 7.999640791319187e-06, "loss": -0.0409, "num_tokens": 27356288.0, "reward": 1.1701492071151733, "reward_std": 1.0564454793930054, "rewards/rollout_reward_func/mean": 1.1701492071151733, "rewards/rollout_reward_func/std": 1.0564454793930054, "sampling/importance_sampling_ratio/max": 1.0762608051300049, "sampling/importance_sampling_ratio/mean": 0.8025164008140564, "sampling/importance_sampling_ratio/min": 4.454627924133092e-05, "sampling/sampling_logp_difference/max": 1.6698811054229736, "sampling/sampling_logp_difference/mean": 0.15690913796424866, "step": 1021, "step_time": 29.507187976050773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9568075332790613, "epoch": 0.01022, "grad_norm": 0.019372155889868736, "kl": 0.27764279022812843, "learning_rate": 7.999640061605819e-06, "loss": -0.0409, "step": 1022, "step_time": 14.279754260991467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.09375, "completions/mean_terminated_length": 4.366666793823242, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7515287427231669, "epoch": 0.01023, "frac_reward_zero_std": 0.25, "grad_norm": 0.06591179221868515, "kl": 0.5019235834479332, "learning_rate": 7.999639331152063e-06, "loss": -0.0551, "num_tokens": 27400111.0, "reward": 1.1759527921676636, "reward_std": 1.181251049041748, "rewards/rollout_reward_func/mean": 1.1759527921676636, "rewards/rollout_reward_func/std": 1.1812509298324585, "sampling/importance_sampling_ratio/max": 1.1422430276870728, "sampling/importance_sampling_ratio/mean": 0.8809300661087036, "sampling/importance_sampling_ratio/min": 4.0742292185314e-05, "sampling/sampling_logp_difference/max": 1.9564590454101562, "sampling/sampling_logp_difference/mean": 0.14974524080753326, "step": 1023, "step_time": 24.516924013005337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7459351979196072, "epoch": 0.01024, "grad_norm": 0.060339681804180145, "kl": 0.47494880110025406, "learning_rate": 7.999638599957913e-06, "loss": -0.0554, "step": 1024, "step_time": 13.822610232004081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.90625, "completions/mean_terminated_length": 4.464285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8431241353973746, "epoch": 0.01025, "frac_reward_zero_std": 0.5, "grad_norm": 0.07927088439464569, "kl": 0.22923245280981064, "learning_rate": 7.999637868023374e-06, "loss": -0.0216, "num_tokens": 27446963.0, "reward": 1.1812553405761719, "reward_std": 1.1859183311462402, "rewards/rollout_reward_func/mean": 1.1812553405761719, "rewards/rollout_reward_func/std": 1.1859183311462402, "sampling/importance_sampling_ratio/max": 1.134183406829834, "sampling/importance_sampling_ratio/mean": 0.8478965759277344, "sampling/importance_sampling_ratio/min": 0.00015423385775648057, "sampling/sampling_logp_difference/max": 1.7228740453720093, "sampling/sampling_logp_difference/mean": 0.16046136617660522, "step": 1025, "step_time": 25.773608822972164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 0.8384883448015898, "epoch": 0.01026, "grad_norm": 0.04176844283938408, "kl": 0.23561754263937473, "learning_rate": 7.999637135348445e-06, "loss": -0.0218, "step": 1026, "step_time": 12.248344399995403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.53125, "completions/mean_terminated_length": 5.159999847412109, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8563206512480974, "epoch": 0.01027, "frac_reward_zero_std": 0.0, "grad_norm": 0.08134026825428009, "kl": 0.36738546937704086, "learning_rate": 7.999636401933128e-06, "loss": -0.0796, "num_tokens": 27503536.0, "reward": 0.8763458728790283, "reward_std": 1.2583309412002563, "rewards/rollout_reward_func/mean": 0.8763458728790283, "rewards/rollout_reward_func/std": 1.2583309412002563, "sampling/importance_sampling_ratio/max": 1.230198621749878, "sampling/importance_sampling_ratio/mean": 0.6440428495407104, "sampling/importance_sampling_ratio/min": 2.390281679254258e-06, "sampling/sampling_logp_difference/max": 1.910285234451294, "sampling/sampling_logp_difference/mean": 0.35671091079711914, "step": 1027, "step_time": 28.50745799500146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8604001249186695, "epoch": 0.01028, "grad_norm": 0.08009426295757294, "kl": 0.3573275711387396, "learning_rate": 7.999635667777419e-06, "loss": -0.08, "step": 1028, "step_time": 14.44923181500053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.15584907587617636, "epoch": 0.01029, "frac_reward_zero_std": 0.75, "grad_norm": 0.007661162409931421, "kl": 0.4426836371421814, "learning_rate": 7.99963493288132e-06, "loss": -0.0174, "num_tokens": 27539318.0, "reward": 0.156976580619812, "reward_std": 1.11002779006958, "rewards/rollout_reward_func/mean": 0.156976580619812, "rewards/rollout_reward_func/std": 1.11002779006958, "sampling/importance_sampling_ratio/max": 1.0523781776428223, "sampling/importance_sampling_ratio/mean": 0.9960047006607056, "sampling/importance_sampling_ratio/min": 0.06684396415948868, "sampling/sampling_logp_difference/max": 1.4899495840072632, "sampling/sampling_logp_difference/mean": 0.027471620589494705, "step": 1029, "step_time": 11.98393958498491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.15838427282869816, "epoch": 0.0103, "grad_norm": 0.007203384302556515, "kl": 0.4426336567848921, "learning_rate": 7.999634197244832e-06, "loss": -0.0174, "step": 1030, "step_time": 6.8121658739837585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.413793087005615, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7277350868098438, "epoch": 0.01031, "frac_reward_zero_std": 0.25, "grad_norm": 0.09844344109296799, "kl": 0.21672626212239265, "learning_rate": 7.999633460867956e-06, "loss": -0.0456, "num_tokens": 27591792.0, "reward": 1.4643718004226685, "reward_std": 0.9044091105461121, "rewards/rollout_reward_func/mean": 1.4643718004226685, "rewards/rollout_reward_func/std": 0.9044091105461121, "sampling/importance_sampling_ratio/max": 1.1682430505752563, "sampling/importance_sampling_ratio/mean": 0.9109905362129211, "sampling/importance_sampling_ratio/min": 0.000679677992593497, "sampling/sampling_logp_difference/max": 1.2038397789001465, "sampling/sampling_logp_difference/mean": 0.12217972427606583, "step": 1031, "step_time": 27.497470793008688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7323969742283225, "epoch": 0.01032, "grad_norm": 0.12102352827787399, "kl": 0.2163374163210392, "learning_rate": 7.99963272375069e-06, "loss": -0.0458, "step": 1032, "step_time": 14.009269528003642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.4375, "completions/mean_terminated_length": 4.344827651977539, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3607755820266902, "epoch": 0.01033, "frac_reward_zero_std": 0.25, "grad_norm": 0.028465131297707558, "kl": 0.34989420883357525, "learning_rate": 7.999631985893037e-06, "loss": -0.0604, "num_tokens": 27643495.0, "reward": 0.5805666446685791, "reward_std": 1.113153338432312, "rewards/rollout_reward_func/mean": 0.5805666446685791, "rewards/rollout_reward_func/std": 1.113153338432312, "sampling/importance_sampling_ratio/max": 1.1216446161270142, "sampling/importance_sampling_ratio/mean": 0.7972341179847717, "sampling/importance_sampling_ratio/min": 6.284048481575155e-08, "sampling/sampling_logp_difference/max": 2.599639892578125, "sampling/sampling_logp_difference/mean": 0.27970919013023376, "step": 1033, "step_time": 26.668851869981154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3714993195608258, "epoch": 0.01034, "grad_norm": 0.028492437675595284, "kl": 0.3489120081067085, "learning_rate": 7.999631247294993e-06, "loss": -0.0605, "step": 1034, "step_time": 13.376372665006784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 4.692307949066162, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0825206488370895, "epoch": 0.01035, "frac_reward_zero_std": 0.0, "grad_norm": 0.12327370792627335, "kl": 0.23557349387556314, "learning_rate": 7.99963050795656e-06, "loss": -0.0563, "num_tokens": 27696108.0, "reward": -0.20730891823768616, "reward_std": 1.0389125347137451, "rewards/rollout_reward_func/mean": -0.20730891823768616, "rewards/rollout_reward_func/std": 1.0389126539230347, "sampling/importance_sampling_ratio/max": 1.0877001285552979, "sampling/importance_sampling_ratio/mean": 0.6106776595115662, "sampling/importance_sampling_ratio/min": 8.782818916586166e-09, "sampling/sampling_logp_difference/max": 2.427504062652588, "sampling/sampling_logp_difference/mean": 0.4336656928062439, "step": 1035, "step_time": 26.74551093703485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0995179638266563, "epoch": 0.01036, "grad_norm": 0.12236727774143219, "kl": 0.22194364480674267, "learning_rate": 7.999629767877742e-06, "loss": -0.0568, "step": 1036, "step_time": 13.949621063991799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 4.960000038146973, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.788323998451233, "epoch": 0.01037, "frac_reward_zero_std": 0.25, "grad_norm": 0.05790955573320389, "kl": 0.221711540594697, "learning_rate": 7.999629027058533e-06, "loss": -0.0642, "num_tokens": 27750583.0, "reward": 0.5075881481170654, "reward_std": 1.3669052124023438, "rewards/rollout_reward_func/mean": 0.5075881481170654, "rewards/rollout_reward_func/std": 1.3669052124023438, "sampling/importance_sampling_ratio/max": 1.3056895732879639, "sampling/importance_sampling_ratio/mean": 0.6729989051818848, "sampling/importance_sampling_ratio/min": 4.304326637338818e-07, "sampling/sampling_logp_difference/max": 1.9265999794006348, "sampling/sampling_logp_difference/mean": 0.36574360728263855, "step": 1037, "step_time": 27.645113378981478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.801531933248043, "epoch": 0.01038, "grad_norm": 0.05413060262799263, "kl": 0.22156107984483242, "learning_rate": 7.999628285498937e-06, "loss": -0.0644, "step": 1038, "step_time": 13.138113940993208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.78125, "completions/mean_terminated_length": 4.5652174949646, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5346683603711426, "epoch": 0.01039, "frac_reward_zero_std": 0.25, "grad_norm": 0.10476535558700562, "kl": 0.2080998830497265, "learning_rate": 7.999627543198954e-06, "loss": -0.0542, "num_tokens": 27804685.0, "reward": 0.7471075654029846, "reward_std": 1.3517334461212158, "rewards/rollout_reward_func/mean": 0.7471075654029846, "rewards/rollout_reward_func/std": 1.3517334461212158, "sampling/importance_sampling_ratio/max": 1.1306973695755005, "sampling/importance_sampling_ratio/mean": 0.6274309158325195, "sampling/importance_sampling_ratio/min": 3.0405558391066734e-06, "sampling/sampling_logp_difference/max": 2.011385679244995, "sampling/sampling_logp_difference/mean": 0.2668971121311188, "step": 1039, "step_time": 31.306458966981154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5437731454148889, "epoch": 0.0104, "grad_norm": 0.10812864452600479, "kl": 0.21071890648454428, "learning_rate": 7.999626800158583e-06, "loss": -0.054, "step": 1040, "step_time": 12.628123545990093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.3125, "completions/mean_terminated_length": 4.4166669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8331014811992645, "epoch": 0.01041, "frac_reward_zero_std": 0.0, "grad_norm": 0.06770726293325424, "kl": 0.6143453381955624, "learning_rate": 7.999626056377823e-06, "loss": -0.083, "num_tokens": 27860367.0, "reward": 0.850868821144104, "reward_std": 1.2397205829620361, "rewards/rollout_reward_func/mean": 0.850868821144104, "rewards/rollout_reward_func/std": 1.2397205829620361, "sampling/importance_sampling_ratio/max": 1.1100562810897827, "sampling/importance_sampling_ratio/mean": 0.6271057724952698, "sampling/importance_sampling_ratio/min": 1.1667775368096045e-07, "sampling/sampling_logp_difference/max": 2.306654214859009, "sampling/sampling_logp_difference/mean": 0.36827272176742554, "step": 1041, "step_time": 30.26814237398503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8372583240270615, "epoch": 0.01042, "grad_norm": 0.07075626403093338, "kl": 0.6155557949095964, "learning_rate": 7.99962531185668e-06, "loss": -0.0828, "step": 1042, "step_time": 13.742428070996539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.46875, "completions/mean_terminated_length": 5.550000190734863, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2276374250650406, "epoch": 0.01043, "frac_reward_zero_std": 0.0, "grad_norm": 0.14182814955711365, "kl": 0.3186523551121354, "learning_rate": 7.999624566595146e-06, "loss": -0.0444, "num_tokens": 27925498.0, "reward": 0.3765103816986084, "reward_std": 1.243160367012024, "rewards/rollout_reward_func/mean": 0.3765103816986084, "rewards/rollout_reward_func/std": 1.2431602478027344, "sampling/importance_sampling_ratio/max": 1.1472419500350952, "sampling/importance_sampling_ratio/mean": 0.46241873502731323, "sampling/importance_sampling_ratio/min": 3.0721383836862515e-07, "sampling/sampling_logp_difference/max": 2.1163368225097656, "sampling/sampling_logp_difference/mean": 0.3387998342514038, "step": 1043, "step_time": 35.75894290098222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.244268462061882, "epoch": 0.01044, "grad_norm": 0.14293192327022552, "kl": 0.30835843831300735, "learning_rate": 7.999623820593227e-06, "loss": -0.045, "step": 1044, "step_time": 13.864413910006988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.78125, "completions/mean_terminated_length": 5.0416669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8973059598356485, "epoch": 0.01045, "frac_reward_zero_std": 0.0, "grad_norm": 0.052262190729379654, "kl": 0.3628650400787592, "learning_rate": 7.99962307385092e-06, "loss": -0.0958, "num_tokens": 27987859.0, "reward": 0.19016626477241516, "reward_std": 1.223580002784729, "rewards/rollout_reward_func/mean": 0.19016626477241516, "rewards/rollout_reward_func/std": 1.223580002784729, "sampling/importance_sampling_ratio/max": 1.2969380617141724, "sampling/importance_sampling_ratio/mean": 0.5813193321228027, "sampling/importance_sampling_ratio/min": 1.9851634078804636e-06, "sampling/sampling_logp_difference/max": 1.831478476524353, "sampling/sampling_logp_difference/mean": 0.4129830002784729, "step": 1045, "step_time": 31.855899207963375 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 1.9284348729997873, "epoch": 0.01046, "grad_norm": 0.06023551896214485, "kl": 0.33302703127264977, "learning_rate": 7.999622326368228e-06, "loss": -0.0959, "step": 1046, "step_time": 16.330645756985177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.03125, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2793195638805628, "epoch": 0.01047, "frac_reward_zero_std": 0.0, "grad_norm": 0.176773339509964, "kl": 0.5608241129666567, "learning_rate": 7.999621578145148e-06, "loss": -0.0482, "num_tokens": 28032293.0, "reward": 0.1399764120578766, "reward_std": 1.3941266536712646, "rewards/rollout_reward_func/mean": 0.1399764120578766, "rewards/rollout_reward_func/std": 1.3941266536712646, "sampling/importance_sampling_ratio/max": 1.1619577407836914, "sampling/importance_sampling_ratio/mean": 0.7588589191436768, "sampling/importance_sampling_ratio/min": 2.3974329451448284e-05, "sampling/sampling_logp_difference/max": 1.797781229019165, "sampling/sampling_logp_difference/mean": 0.22394539415836334, "step": 1047, "step_time": 27.615308851993177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2943849638104439, "epoch": 0.01048, "grad_norm": 0.16524530947208405, "kl": 0.5547144375741482, "learning_rate": 7.999620829181686e-06, "loss": -0.0484, "step": 1048, "step_time": 13.081091408006614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.480000019073486, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4387493953108788, "epoch": 0.01049, "frac_reward_zero_std": 0.25, "grad_norm": 0.07293304800987244, "kl": 0.42784352228045464, "learning_rate": 7.999620079477833e-06, "loss": -0.0431, "num_tokens": 28076335.0, "reward": 0.800874650478363, "reward_std": 1.3750643730163574, "rewards/rollout_reward_func/mean": 0.800874650478363, "rewards/rollout_reward_func/std": 1.375064492225647, "sampling/importance_sampling_ratio/max": 1.0453033447265625, "sampling/importance_sampling_ratio/mean": 0.6710827350616455, "sampling/importance_sampling_ratio/min": 1.6822263205540366e-05, "sampling/sampling_logp_difference/max": 1.7223131656646729, "sampling/sampling_logp_difference/mean": 0.21347905695438385, "step": 1049, "step_time": 28.46310095100489 }, { "clip_ratio/high_max": 0.0059523810632526875, "clip_ratio/high_mean": 0.0029761905316263437, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029761905316263437, "entropy": 1.4450715407729149, "epoch": 0.0105, "grad_norm": 0.0491156280040741, "kl": 0.3866303777322173, "learning_rate": 7.999619329033598e-06, "loss": -0.0434, "step": 1050, "step_time": 12.103243206001935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.480000019073486, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3156090695410967, "epoch": 0.01051, "frac_reward_zero_std": 0.25, "grad_norm": 0.05777231976389885, "kl": 0.33413010463118553, "learning_rate": 7.999618577848975e-06, "loss": -0.074, "num_tokens": 28126150.0, "reward": 1.1739751100540161, "reward_std": 1.2694816589355469, "rewards/rollout_reward_func/mean": 1.1739751100540161, "rewards/rollout_reward_func/std": 1.2694816589355469, "sampling/importance_sampling_ratio/max": 1.4185466766357422, "sampling/importance_sampling_ratio/mean": 0.7433109879493713, "sampling/importance_sampling_ratio/min": 5.771524229203351e-06, "sampling/sampling_logp_difference/max": 1.80791175365448, "sampling/sampling_logp_difference/mean": 0.22680459916591644, "step": 1051, "step_time": 35.050582968004164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3171973787248135, "epoch": 0.01052, "grad_norm": 0.055302951484918594, "kl": 0.33451036736369133, "learning_rate": 7.999617825923968e-06, "loss": -0.074, "step": 1052, "step_time": 15.229219819972059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7288551814854145, "epoch": 0.01053, "frac_reward_zero_std": 0.25, "grad_norm": 0.07500389963388443, "kl": 0.5644523361697793, "learning_rate": 7.999617073258575e-06, "loss": -0.0631, "num_tokens": 28175414.0, "reward": 1.0174559354782104, "reward_std": 1.266404151916504, "rewards/rollout_reward_func/mean": 1.0174559354782104, "rewards/rollout_reward_func/std": 1.266404151916504, "sampling/importance_sampling_ratio/max": 1.074363112449646, "sampling/importance_sampling_ratio/mean": 0.6828510761260986, "sampling/importance_sampling_ratio/min": 7.544073014287278e-06, "sampling/sampling_logp_difference/max": 2.2939724922180176, "sampling/sampling_logp_difference/mean": 0.3321893811225891, "step": 1053, "step_time": 25.78161386899592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7323510814458132, "epoch": 0.01054, "grad_norm": 0.07400215417146683, "kl": 0.5514860488474369, "learning_rate": 7.999616319852798e-06, "loss": -0.0632, "step": 1054, "step_time": 12.357184519976727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.09375, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1124257594347, "epoch": 0.01055, "frac_reward_zero_std": 0.0, "grad_norm": 0.10978524386882782, "kl": 0.2598565351217985, "learning_rate": 7.999615565706636e-06, "loss": -0.0691, "num_tokens": 28232422.0, "reward": 0.020244169980287552, "reward_std": 1.1276944875717163, "rewards/rollout_reward_func/mean": 0.020244169980287552, "rewards/rollout_reward_func/std": 1.1276946067810059, "sampling/importance_sampling_ratio/max": 1.1373493671417236, "sampling/importance_sampling_ratio/mean": 0.5501667261123657, "sampling/importance_sampling_ratio/min": 2.480026705597993e-05, "sampling/sampling_logp_difference/max": 1.8068079948425293, "sampling/sampling_logp_difference/mean": 0.32893359661102295, "step": 1055, "step_time": 30.836553500965238 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.004166666883975267, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "entropy": 2.1054166592657566, "epoch": 0.01056, "grad_norm": 0.10638661682605743, "kl": 0.2605165671557188, "learning_rate": 7.999614810820089e-06, "loss": -0.0691, "step": 1056, "step_time": 14.931843606973416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.125, "completions/mean_terminated_length": 6.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.651202257722616, "epoch": 0.01057, "frac_reward_zero_std": 0.0, "grad_norm": 0.08540225774049759, "kl": 0.3610877124592662, "learning_rate": 7.999614055193157e-06, "loss": -0.1026, "num_tokens": 28296760.0, "reward": 0.528245210647583, "reward_std": 1.3175599575042725, "rewards/rollout_reward_func/mean": 0.528245210647583, "rewards/rollout_reward_func/std": 1.317559838294983, "sampling/importance_sampling_ratio/max": 1.1668729782104492, "sampling/importance_sampling_ratio/mean": 0.4646734595298767, "sampling/importance_sampling_ratio/min": 1.8148450180888176e-05, "sampling/sampling_logp_difference/max": 1.7057496309280396, "sampling/sampling_logp_difference/mean": 0.3807542324066162, "step": 1057, "step_time": 33.27105182298692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.6311783138662577, "epoch": 0.01058, "grad_norm": 0.06465937942266464, "kl": 0.3561981599777937, "learning_rate": 7.999613298825842e-06, "loss": -0.1029, "step": 1058, "step_time": 14.199599343002774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.03125, "completions/mean_terminated_length": 4.91304349899292, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.207616785541177, "epoch": 0.01059, "frac_reward_zero_std": 0.0, "grad_norm": 0.11543673276901245, "kl": 0.2091167513281107, "learning_rate": 7.999612541718143e-06, "loss": -0.0825, "num_tokens": 28353743.0, "reward": 0.642658531665802, "reward_std": 1.1686711311340332, "rewards/rollout_reward_func/mean": 0.642658531665802, "rewards/rollout_reward_func/std": 1.1686712503433228, "sampling/importance_sampling_ratio/max": 1.3487529754638672, "sampling/importance_sampling_ratio/mean": 0.5935107469558716, "sampling/importance_sampling_ratio/min": 9.889100027749009e-08, "sampling/sampling_logp_difference/max": 1.945013165473938, "sampling/sampling_logp_difference/mean": 0.40595781803131104, "step": 1059, "step_time": 30.64996541396249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.178357971832156, "epoch": 0.0106, "grad_norm": 0.10328052192926407, "kl": 0.21108614467084408, "learning_rate": 7.99961178387006e-06, "loss": -0.0826, "step": 1060, "step_time": 13.977964989986503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 7.0625, "completions/mean_terminated_length": 4.083333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.694470800459385, "epoch": 0.01061, "frac_reward_zero_std": 0.0, "grad_norm": 0.09864899516105652, "kl": 0.19551722845062613, "learning_rate": 7.999611025281593e-06, "loss": -0.0755, "num_tokens": 28399266.0, "reward": 0.7956904768943787, "reward_std": 1.2703050374984741, "rewards/rollout_reward_func/mean": 0.7956904768943787, "rewards/rollout_reward_func/std": 1.2703050374984741, "sampling/importance_sampling_ratio/max": 1.103163719177246, "sampling/importance_sampling_ratio/mean": 0.6968985795974731, "sampling/importance_sampling_ratio/min": 1.086943484551739e-05, "sampling/sampling_logp_difference/max": 2.011983871459961, "sampling/sampling_logp_difference/mean": 0.33426910638809204, "step": 1061, "step_time": 25.081870972993784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009765625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "entropy": 1.6709971372038126, "epoch": 0.01062, "grad_norm": 0.08599738776683807, "kl": 0.20322538074105978, "learning_rate": 7.999610265952743e-06, "loss": -0.0758, "step": 1062, "step_time": 12.517662893995293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.125, "completions/mean_terminated_length": 4.4210524559021, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.253272651694715, "epoch": 0.01063, "frac_reward_zero_std": 0.0, "grad_norm": 0.06320932507514954, "kl": 0.3278659046627581, "learning_rate": 7.99960950588351e-06, "loss": -0.0957, "num_tokens": 28459311.0, "reward": 0.5374172329902649, "reward_std": 1.2394804954528809, "rewards/rollout_reward_func/mean": 0.5374172329902649, "rewards/rollout_reward_func/std": 1.2394803762435913, "sampling/importance_sampling_ratio/max": 1.3131945133209229, "sampling/importance_sampling_ratio/mean": 0.5837449431419373, "sampling/importance_sampling_ratio/min": 1.527018582692108e-07, "sampling/sampling_logp_difference/max": 1.9667772054672241, "sampling/sampling_logp_difference/mean": 0.4231448173522949, "step": 1063, "step_time": 31.792780445015524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2336913906037807, "epoch": 0.01064, "grad_norm": 0.05866284295916557, "kl": 0.3627744186669588, "learning_rate": 7.999608745073893e-06, "loss": -0.0958, "step": 1064, "step_time": 13.812666292040376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.65625, "completions/mean_terminated_length": 5.782608985900879, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2688619336113334, "epoch": 0.01065, "frac_reward_zero_std": 0.0, "grad_norm": 0.15756259858608246, "kl": 1.2511152131482959, "learning_rate": 7.999607983523892e-06, "loss": -0.0809, "num_tokens": 28518327.0, "reward": 0.6439201235771179, "reward_std": 1.1569775342941284, "rewards/rollout_reward_func/mean": 0.6439201235771179, "rewards/rollout_reward_func/std": 1.1569775342941284, "sampling/importance_sampling_ratio/max": 1.2254780530929565, "sampling/importance_sampling_ratio/mean": 0.5735501646995544, "sampling/importance_sampling_ratio/min": 6.630618720748771e-09, "sampling/sampling_logp_difference/max": 2.5744869709014893, "sampling/sampling_logp_difference/mean": 0.4061826467514038, "step": 1065, "step_time": 32.37172659399221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.262838203459978, "epoch": 0.01066, "grad_norm": 0.14781075716018677, "kl": 1.1872099041938782, "learning_rate": 7.999607221233511e-06, "loss": -0.0812, "step": 1066, "step_time": 15.045443816008628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.96875, "completions/mean_terminated_length": 4.111111164093018, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1153136948123574, "epoch": 0.01067, "frac_reward_zero_std": 0.5, "grad_norm": 0.05429987609386444, "kl": 0.26544367522001266, "learning_rate": 7.999606458202746e-06, "loss": -0.04, "num_tokens": 28554887.0, "reward": 1.5614418983459473, "reward_std": 0.9484038352966309, "rewards/rollout_reward_func/mean": 1.5614418983459473, "rewards/rollout_reward_func/std": 0.9484038352966309, "sampling/importance_sampling_ratio/max": 1.0594995021820068, "sampling/importance_sampling_ratio/mean": 0.8230868577957153, "sampling/importance_sampling_ratio/min": 2.3087019940248865e-07, "sampling/sampling_logp_difference/max": 2.207247018814087, "sampling/sampling_logp_difference/mean": 0.24883341789245605, "step": 1067, "step_time": 21.402137570999912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1162099433131516, "epoch": 0.01068, "grad_norm": 0.054073311388492584, "kl": 0.27397275529801846, "learning_rate": 7.999605694431597e-06, "loss": -0.0401, "step": 1068, "step_time": 11.272185421999893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.59375, "completions/mean_terminated_length": 5.227272987365723, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4607450515031815, "epoch": 0.01069, "frac_reward_zero_std": 0.0, "grad_norm": 0.1365082710981369, "kl": 0.9536126768216491, "learning_rate": 7.99960492992007e-06, "loss": -0.075, "num_tokens": 28610953.0, "reward": 0.2992226481437683, "reward_std": 1.3450356721878052, "rewards/rollout_reward_func/mean": 0.2992226481437683, "rewards/rollout_reward_func/std": 1.3450356721878052, "sampling/importance_sampling_ratio/max": 1.2461774349212646, "sampling/importance_sampling_ratio/mean": 0.5619184970855713, "sampling/importance_sampling_ratio/min": 1.3932473166278214e-06, "sampling/sampling_logp_difference/max": 1.6025644540786743, "sampling/sampling_logp_difference/mean": 0.3673567771911621, "step": 1069, "step_time": 32.65406413299206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4524005651474, "epoch": 0.0107, "grad_norm": 0.1358211636543274, "kl": 1.0070228800177574, "learning_rate": 7.999604164668158e-06, "loss": -0.0748, "step": 1070, "step_time": 13.626067664008588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 4.869565486907959, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1431910321116447, "epoch": 0.01071, "frac_reward_zero_std": 0.0, "grad_norm": 0.13042792677879333, "kl": 0.22975269611924887, "learning_rate": 7.999603398675864e-06, "loss": -0.0857, "num_tokens": 28666398.0, "reward": 0.5127083659172058, "reward_std": 1.2522557973861694, "rewards/rollout_reward_func/mean": 0.5127083659172058, "rewards/rollout_reward_func/std": 1.2522557973861694, "sampling/importance_sampling_ratio/max": 1.13787841796875, "sampling/importance_sampling_ratio/mean": 0.6166452169418335, "sampling/importance_sampling_ratio/min": 4.593819653564424e-07, "sampling/sampling_logp_difference/max": 2.128549337387085, "sampling/sampling_logp_difference/mean": 0.4116744101047516, "step": 1071, "step_time": 31.45982726602233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1375737413764, "epoch": 0.01072, "grad_norm": 0.12951141595840454, "kl": 0.22414661943912506, "learning_rate": 7.99960263194319e-06, "loss": -0.0859, "step": 1072, "step_time": 13.792652957999962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.125, "completions/mean_terminated_length": 5.920000076293945, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.179009258747101, "epoch": 0.01073, "frac_reward_zero_std": 0.0, "grad_norm": 0.09103073924779892, "kl": 0.42257796693593264, "learning_rate": 7.999601864470135e-06, "loss": -0.0803, "num_tokens": 28718821.0, "reward": 0.5306378602981567, "reward_std": 1.4028682708740234, "rewards/rollout_reward_func/mean": 0.5306378602981567, "rewards/rollout_reward_func/std": 1.4028681516647339, "sampling/importance_sampling_ratio/max": 1.3265578746795654, "sampling/importance_sampling_ratio/mean": 0.6114554405212402, "sampling/importance_sampling_ratio/min": 5.392771072365576e-06, "sampling/sampling_logp_difference/max": 1.730597734451294, "sampling/sampling_logp_difference/mean": 0.3372752070426941, "step": 1073, "step_time": 29.45060111900966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1745919063687325, "epoch": 0.01074, "grad_norm": 0.09436382353305817, "kl": 0.4218642897903919, "learning_rate": 7.999601096256697e-06, "loss": -0.0803, "step": 1074, "step_time": 13.610624567998457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009375000139698386, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009375000139698386, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.15625, "completions/mean_terminated_length": 6.34615421295166, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.5982604697346687, "epoch": 0.01075, "frac_reward_zero_std": 0.0, "grad_norm": 0.12617334723472595, "kl": 0.2740452354773879, "learning_rate": 7.999600327302877e-06, "loss": -0.0995, "num_tokens": 28773523.0, "reward": -0.18243980407714844, "reward_std": 1.2664662599563599, "rewards/rollout_reward_func/mean": -0.18243980407714844, "rewards/rollout_reward_func/std": 1.2664663791656494, "sampling/importance_sampling_ratio/max": 1.6283130645751953, "sampling/importance_sampling_ratio/mean": 0.5278632044792175, "sampling/importance_sampling_ratio/min": 4.592221216626058e-07, "sampling/sampling_logp_difference/max": 2.3204545974731445, "sampling/sampling_logp_difference/mean": 0.47537726163864136, "step": 1075, "step_time": 28.028544501008582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 2.5962559655308723, "epoch": 0.01076, "grad_norm": 0.12561701238155365, "kl": 0.2715200656093657, "learning_rate": 7.999599557608678e-06, "loss": -0.0997, "step": 1076, "step_time": 14.04187048599124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.9375, "completions/mean_terminated_length": 5.727272987365723, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2470747400075197, "epoch": 0.01077, "frac_reward_zero_std": 0.0, "grad_norm": 0.0772310420870781, "kl": 0.27668708469718695, "learning_rate": 7.9995987871741e-06, "loss": -0.0632, "num_tokens": 28837463.0, "reward": 0.21138082444667816, "reward_std": 1.2718870639801025, "rewards/rollout_reward_func/mean": 0.21138082444667816, "rewards/rollout_reward_func/std": 1.2718870639801025, "sampling/importance_sampling_ratio/max": 1.2349790334701538, "sampling/importance_sampling_ratio/mean": 0.5809152126312256, "sampling/importance_sampling_ratio/min": 8.124268060782924e-05, "sampling/sampling_logp_difference/max": 2.1895785331726074, "sampling/sampling_logp_difference/mean": 0.34137094020843506, "step": 1077, "step_time": 36.67105942800117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.253598405048251, "epoch": 0.01078, "grad_norm": 0.0931023359298706, "kl": 0.30269960779696703, "learning_rate": 7.999598015999138e-06, "loss": -0.0633, "step": 1078, "step_time": 15.789532053997391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.9375, "completions/mean_terminated_length": 5.727272987365723, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1186528904363513, "epoch": 0.01079, "frac_reward_zero_std": 0.0, "grad_norm": 0.16715721786022186, "kl": 0.8292125770822167, "learning_rate": 7.999597244083798e-06, "loss": -0.0943, "num_tokens": 28892882.0, "reward": 0.5703413486480713, "reward_std": 1.3000855445861816, "rewards/rollout_reward_func/mean": 0.5703413486480713, "rewards/rollout_reward_func/std": 1.3000855445861816, "sampling/importance_sampling_ratio/max": 1.29646897315979, "sampling/importance_sampling_ratio/mean": 0.5810850858688354, "sampling/importance_sampling_ratio/min": 1.336483251179743e-06, "sampling/sampling_logp_difference/max": 2.0811586380004883, "sampling/sampling_logp_difference/mean": 0.3755375146865845, "step": 1079, "step_time": 30.535403213987593 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 2.1193512259051204, "epoch": 0.0108, "grad_norm": 0.12996584177017212, "kl": 0.7220260808244348, "learning_rate": 7.999596471428079e-06, "loss": -0.0948, "step": 1080, "step_time": 12.869648875988787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 4.615384578704834, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6028648912906647, "epoch": 0.01081, "frac_reward_zero_std": 0.0, "grad_norm": 0.17289753258228302, "kl": 0.5640266723930836, "learning_rate": 7.999595698031978e-06, "loss": -0.0438, "num_tokens": 28954695.0, "reward": 0.7225408554077148, "reward_std": 1.2391059398651123, "rewards/rollout_reward_func/mean": 0.7225408554077148, "rewards/rollout_reward_func/std": 1.2391059398651123, "sampling/importance_sampling_ratio/max": 1.4059224128723145, "sampling/importance_sampling_ratio/mean": 0.6876834630966187, "sampling/importance_sampling_ratio/min": 9.234587923856452e-05, "sampling/sampling_logp_difference/max": 1.5608248710632324, "sampling/sampling_logp_difference/mean": 0.2826305031776428, "step": 1081, "step_time": 28.744043545026216 }, { "clip_ratio/high_max": 0.014363354537636042, "clip_ratio/high_mean": 0.007181677268818021, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007181677268818021, "entropy": 1.6120274029672146, "epoch": 0.01082, "grad_norm": 0.16244010627269745, "kl": 0.4901031833142042, "learning_rate": 7.999594923895498e-06, "loss": -0.0447, "step": 1082, "step_time": 13.015529980009887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 5.818181991577148, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.9094931930303574, "epoch": 0.01083, "frac_reward_zero_std": 0.0, "grad_norm": 0.16051824390888214, "kl": 0.2942728325724602, "learning_rate": 7.999594149018638e-06, "loss": -0.0858, "num_tokens": 29010185.0, "reward": 0.03314190357923508, "reward_std": 1.3677308559417725, "rewards/rollout_reward_func/mean": 0.03314190357923508, "rewards/rollout_reward_func/std": 1.3677308559417725, "sampling/importance_sampling_ratio/max": 1.1264585256576538, "sampling/importance_sampling_ratio/mean": 0.4976809322834015, "sampling/importance_sampling_ratio/min": 1.6915149103624572e-07, "sampling/sampling_logp_difference/max": 2.378268241882324, "sampling/sampling_logp_difference/mean": 0.49391961097717285, "step": 1083, "step_time": 31.536189144011587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.909715414047241, "epoch": 0.01084, "grad_norm": 0.14873242378234863, "kl": 0.2657337477430701, "learning_rate": 7.9995933734014e-06, "loss": -0.0856, "step": 1084, "step_time": 14.632066289006616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.625, "completions/mean_terminated_length": 4.200000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.904539623297751, "epoch": 0.01085, "frac_reward_zero_std": 0.0, "grad_norm": 0.04030067101120949, "kl": 0.20800893427804112, "learning_rate": 7.99959259704378e-06, "loss": -0.069, "num_tokens": 29063513.0, "reward": 0.7528938055038452, "reward_std": 1.3526229858398438, "rewards/rollout_reward_func/mean": 0.7528938055038452, "rewards/rollout_reward_func/std": 1.3526228666305542, "sampling/importance_sampling_ratio/max": 1.0894782543182373, "sampling/importance_sampling_ratio/mean": 0.5956878662109375, "sampling/importance_sampling_ratio/min": 1.8566224753158167e-06, "sampling/sampling_logp_difference/max": 1.7866942882537842, "sampling/sampling_logp_difference/mean": 0.3888024091720581, "step": 1085, "step_time": 29.477665907004848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9072826644405723, "epoch": 0.01086, "grad_norm": 0.04133687540888786, "kl": 0.20211538230068982, "learning_rate": 7.999591819945785e-06, "loss": -0.0688, "step": 1086, "step_time": 13.938716022006702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.620689868927002, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1955769015476108, "epoch": 0.01087, "frac_reward_zero_std": 0.25, "grad_norm": 0.12179865688085556, "kl": 0.7350202165544033, "learning_rate": 7.999591042107408e-06, "loss": -0.0509, "num_tokens": 29114767.0, "reward": 1.1824500560760498, "reward_std": 1.0022083520889282, "rewards/rollout_reward_func/mean": 1.1824500560760498, "rewards/rollout_reward_func/std": 1.0022084712982178, "sampling/importance_sampling_ratio/max": 1.2297990322113037, "sampling/importance_sampling_ratio/mean": 0.7832853198051453, "sampling/importance_sampling_ratio/min": 0.0001674074592301622, "sampling/sampling_logp_difference/max": 1.9658515453338623, "sampling/sampling_logp_difference/mean": 0.2441961169242859, "step": 1087, "step_time": 27.597118656005478 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 1.198658151552081, "epoch": 0.01088, "grad_norm": 0.07780741155147552, "kl": 0.6339957173913717, "learning_rate": 7.999590263528655e-06, "loss": -0.0512, "step": 1088, "step_time": 14.435075316985603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.03125, "completions/mean_terminated_length": 5.750000476837158, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5954938624054193, "epoch": 0.01089, "frac_reward_zero_std": 0.25, "grad_norm": 0.03385436162352562, "kl": 0.26937560737133026, "learning_rate": 7.999589484209522e-06, "loss": -0.0502, "num_tokens": 29171655.0, "reward": 0.5204365849494934, "reward_std": 1.2639479637145996, "rewards/rollout_reward_func/mean": 0.5204365849494934, "rewards/rollout_reward_func/std": 1.2639477252960205, "sampling/importance_sampling_ratio/max": 1.1785273551940918, "sampling/importance_sampling_ratio/mean": 0.6440962553024292, "sampling/importance_sampling_ratio/min": 1.3768782991974149e-05, "sampling/sampling_logp_difference/max": 1.591392993927002, "sampling/sampling_logp_difference/mean": 0.29225754737854004, "step": 1089, "step_time": 34.94263014399621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.606195379048586, "epoch": 0.0109, "grad_norm": 0.03562043234705925, "kl": 0.2625946197658777, "learning_rate": 7.999588704150011e-06, "loss": -0.0501, "step": 1090, "step_time": 15.845898102998035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.96875, "completions/mean_terminated_length": 5.285714149475098, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1980492249131203, "epoch": 0.01091, "frac_reward_zero_std": 0.0, "grad_norm": 0.08067813515663147, "kl": 0.2668177937157452, "learning_rate": 7.999587923350123e-06, "loss": -0.0607, "num_tokens": 29235527.0, "reward": 0.4315764605998993, "reward_std": 1.2217671871185303, "rewards/rollout_reward_func/mean": 0.4315764605998993, "rewards/rollout_reward_func/std": 1.2217670679092407, "sampling/importance_sampling_ratio/max": 1.366419792175293, "sampling/importance_sampling_ratio/mean": 0.5257430672645569, "sampling/importance_sampling_ratio/min": 1.0569232472334988e-05, "sampling/sampling_logp_difference/max": 1.9928921461105347, "sampling/sampling_logp_difference/mean": 0.3798980414867401, "step": 1091, "step_time": 32.71665670200309 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016741071827709675, "entropy": 2.208162359893322, "epoch": 0.01092, "grad_norm": 0.05415384843945503, "kl": 0.2582878381945193, "learning_rate": 7.999587141809856e-06, "loss": -0.0609, "step": 1092, "step_time": 14.170065858008456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.46875, "completions/mean_terminated_length": 4.379310131072998, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0011746548116207, "epoch": 0.01093, "frac_reward_zero_std": 0.5, "grad_norm": 0.08503407984972, "kl": 0.2472492828965187, "learning_rate": 7.999586359529212e-06, "loss": -0.0297, "num_tokens": 29273586.0, "reward": 0.9819364547729492, "reward_std": 1.3131885528564453, "rewards/rollout_reward_func/mean": 0.9819364547729492, "rewards/rollout_reward_func/std": 1.3131885528564453, "sampling/importance_sampling_ratio/max": 1.075926661491394, "sampling/importance_sampling_ratio/mean": 0.8669313192367554, "sampling/importance_sampling_ratio/min": 1.9671925599595852e-07, "sampling/sampling_logp_difference/max": 2.1164774894714355, "sampling/sampling_logp_difference/mean": 0.20071861147880554, "step": 1093, "step_time": 21.270604530000128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9972070064395666, "epoch": 0.01094, "grad_norm": 0.08580050617456436, "kl": 0.24865382444113493, "learning_rate": 7.99958557650819e-06, "loss": -0.0299, "step": 1094, "step_time": 10.94651289700414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.8125, "completions/mean_terminated_length": 5.5789475440979, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.589526057243347, "epoch": 0.01095, "frac_reward_zero_std": 0.0, "grad_norm": 0.0383034273982048, "kl": 0.16018053889274597, "learning_rate": 7.999584792746792e-06, "loss": -0.0788, "num_tokens": 29335826.0, "reward": 0.4351346790790558, "reward_std": 1.313158392906189, "rewards/rollout_reward_func/mean": 0.4351346790790558, "rewards/rollout_reward_func/std": 1.313158392906189, "sampling/importance_sampling_ratio/max": 1.2202792167663574, "sampling/importance_sampling_ratio/mean": 0.4415644705295563, "sampling/importance_sampling_ratio/min": 2.046814188361168e-06, "sampling/sampling_logp_difference/max": 1.9463768005371094, "sampling/sampling_logp_difference/mean": 0.4314500689506531, "step": 1095, "step_time": 39.35063455400814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5838934183120728, "epoch": 0.01096, "grad_norm": 0.03743673861026764, "kl": 0.15915028052404523, "learning_rate": 7.999584008245017e-06, "loss": -0.0788, "step": 1096, "step_time": 15.860504023003159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0625, "completions/mean_terminated_length": 4.454545497894287, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8870489802211523, "epoch": 0.01097, "frac_reward_zero_std": 0.25, "grad_norm": 0.11419471353292465, "kl": 0.143034178763628, "learning_rate": 7.999583223002866e-06, "loss": -0.0788, "num_tokens": 29392020.0, "reward": 0.7989781498908997, "reward_std": 1.3371998071670532, "rewards/rollout_reward_func/mean": 0.7989781498908997, "rewards/rollout_reward_func/std": 1.3371998071670532, "sampling/importance_sampling_ratio/max": 1.1696115732192993, "sampling/importance_sampling_ratio/mean": 0.6401657462120056, "sampling/importance_sampling_ratio/min": 2.364645069974358e-06, "sampling/sampling_logp_difference/max": 2.5784385204315186, "sampling/sampling_logp_difference/mean": 0.30978918075561523, "step": 1097, "step_time": 31.644872061995557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8717854376882315, "epoch": 0.01098, "grad_norm": 0.10226437449455261, "kl": 0.14552093297243118, "learning_rate": 7.999582437020337e-06, "loss": -0.0795, "step": 1098, "step_time": 13.841912818999845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.03125, "completions/mean_terminated_length": 5.863636493682861, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4343655295670033, "epoch": 0.01099, "frac_reward_zero_std": 0.0, "grad_norm": 0.08172870427370071, "kl": 0.46691515017300844, "learning_rate": 7.999581650297431e-06, "loss": -0.0653, "num_tokens": 29456789.0, "reward": 0.1949537843465805, "reward_std": 1.0819783210754395, "rewards/rollout_reward_func/mean": 0.1949537843465805, "rewards/rollout_reward_func/std": 1.0819783210754395, "sampling/importance_sampling_ratio/max": 1.267104983329773, "sampling/importance_sampling_ratio/mean": 0.5162468552589417, "sampling/importance_sampling_ratio/min": 5.515218504115182e-07, "sampling/sampling_logp_difference/max": 1.8915610313415527, "sampling/sampling_logp_difference/mean": 0.42852744460105896, "step": 1099, "step_time": 33.53915469300409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.425469495356083, "epoch": 0.011, "grad_norm": 0.0769071951508522, "kl": 0.48805052787065506, "learning_rate": 7.999580862834148e-06, "loss": -0.0657, "step": 1100, "step_time": 14.044934038014617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.9375, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0743207838386297, "epoch": 0.01101, "frac_reward_zero_std": 0.5, "grad_norm": 0.02218817174434662, "kl": 0.37773499451577663, "learning_rate": 7.999580074630491e-06, "loss": -0.0471, "num_tokens": 29501441.0, "reward": 1.4717494249343872, "reward_std": 1.0552542209625244, "rewards/rollout_reward_func/mean": 1.4717494249343872, "rewards/rollout_reward_func/std": 1.0552542209625244, "sampling/importance_sampling_ratio/max": 1.079380989074707, "sampling/importance_sampling_ratio/mean": 0.839795708656311, "sampling/importance_sampling_ratio/min": 2.0434976249816827e-05, "sampling/sampling_logp_difference/max": 1.6944118738174438, "sampling/sampling_logp_difference/mean": 0.18042925000190735, "step": 1101, "step_time": 28.634895406023134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0685284165665507, "epoch": 0.01102, "grad_norm": 0.0285042617470026, "kl": 0.40339029021561146, "learning_rate": 7.999579285686457e-06, "loss": -0.047, "step": 1102, "step_time": 13.902104014981887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.15625, "completions/mean_terminated_length": 5.08695650100708, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0331934802234173, "epoch": 0.01103, "frac_reward_zero_std": 0.0, "grad_norm": 0.18059584498405457, "kl": 0.37333339639008045, "learning_rate": 7.99957849600205e-06, "loss": -0.0521, "num_tokens": 29554460.0, "reward": -0.38143613934516907, "reward_std": 0.9211606979370117, "rewards/rollout_reward_func/mean": -0.38143613934516907, "rewards/rollout_reward_func/std": 0.9211606383323669, "sampling/importance_sampling_ratio/max": 1.3514220714569092, "sampling/importance_sampling_ratio/mean": 0.5055606365203857, "sampling/importance_sampling_ratio/min": 1.7884727299133374e-07, "sampling/sampling_logp_difference/max": 1.8796062469482422, "sampling/sampling_logp_difference/mean": 0.35934382677078247, "step": 1103, "step_time": 33.62990363400604 }, { "clip_ratio/high_max": 0.03750000009313226, "clip_ratio/high_mean": 0.01875000004656613, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01875000004656613, "entropy": 1.9962172619998455, "epoch": 0.01104, "grad_norm": 0.08767246454954147, "kl": 0.3645666530355811, "learning_rate": 7.999577705577265e-06, "loss": -0.0533, "step": 1104, "step_time": 15.525230651983293 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 4.148148059844971, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2436199542135, "epoch": 0.01105, "frac_reward_zero_std": 0.25, "grad_norm": 0.19354532659053802, "kl": 1.1780412243679166, "learning_rate": 7.999576914412106e-06, "loss": -0.0449, "num_tokens": 29608760.0, "reward": 1.1221072673797607, "reward_std": 1.183715581893921, "rewards/rollout_reward_func/mean": 1.1221072673797607, "rewards/rollout_reward_func/std": 1.1837154626846313, "sampling/importance_sampling_ratio/max": 1.1264725923538208, "sampling/importance_sampling_ratio/mean": 0.7986712455749512, "sampling/importance_sampling_ratio/min": 1.6967327383099473e-06, "sampling/sampling_logp_difference/max": 1.9642724990844727, "sampling/sampling_logp_difference/mean": 0.2852843105792999, "step": 1105, "step_time": 30.19843207199301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2378033706918359, "epoch": 0.01106, "grad_norm": 0.19451768696308136, "kl": 1.214645997621119, "learning_rate": 7.999576122506571e-06, "loss": -0.0442, "step": 1106, "step_time": 14.933119468012592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 5.185185432434082, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5002991193905473, "epoch": 0.01107, "frac_reward_zero_std": 0.0, "grad_norm": 0.1691911518573761, "kl": 0.9424042366445065, "learning_rate": 7.999575329860661e-06, "loss": -0.0544, "num_tokens": 29660504.0, "reward": 0.5825067758560181, "reward_std": 1.1908178329467773, "rewards/rollout_reward_func/mean": 0.5825067758560181, "rewards/rollout_reward_func/std": 1.1908177137374878, "sampling/importance_sampling_ratio/max": 1.1170471906661987, "sampling/importance_sampling_ratio/mean": 0.6520093083381653, "sampling/importance_sampling_ratio/min": 3.6721550713991746e-05, "sampling/sampling_logp_difference/max": 1.90933358669281, "sampling/sampling_logp_difference/mean": 0.343411922454834, "step": 1107, "step_time": 28.825228998000966 }, { "clip_ratio/high_max": 0.004629629664123058, "clip_ratio/high_mean": 0.002314814832061529, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002314814832061529, "entropy": 1.5154809337109327, "epoch": 0.01108, "grad_norm": 0.16234774887561798, "kl": 0.8619656935334206, "learning_rate": 7.999574536474376e-06, "loss": -0.0551, "step": 1108, "step_time": 14.464777751010843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.84375, "completions/mean_terminated_length": 4.730769634246826, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3709142096340656, "epoch": 0.01109, "frac_reward_zero_std": 0.25, "grad_norm": 0.06022389978170395, "kl": 0.25296452082693577, "learning_rate": 7.999573742347718e-06, "loss": -0.0631, "num_tokens": 29712709.0, "reward": 0.7986736297607422, "reward_std": 1.2057417631149292, "rewards/rollout_reward_func/mean": 0.7986736297607422, "rewards/rollout_reward_func/std": 1.2057417631149292, "sampling/importance_sampling_ratio/max": 1.246848464012146, "sampling/importance_sampling_ratio/mean": 0.7719182968139648, "sampling/importance_sampling_ratio/min": 2.461240740103676e-07, "sampling/sampling_logp_difference/max": 2.0450596809387207, "sampling/sampling_logp_difference/mean": 0.30254361033439636, "step": 1109, "step_time": 27.93464299099287 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 1.3906934782862663, "epoch": 0.0111, "grad_norm": 0.06191449984908104, "kl": 0.24209484737366438, "learning_rate": 7.999572947480686e-06, "loss": -0.0632, "step": 1110, "step_time": 14.01876168398303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.15625, "completions/mean_terminated_length": 5.08695650100708, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.056567892432213, "epoch": 0.01111, "frac_reward_zero_std": 0.0, "grad_norm": 0.08112872391939163, "kl": 0.31739156832918525, "learning_rate": 7.999572151873277e-06, "loss": -0.0838, "num_tokens": 29776016.0, "reward": 0.4536041021347046, "reward_std": 1.2814671993255615, "rewards/rollout_reward_func/mean": 0.4536041021347046, "rewards/rollout_reward_func/std": 1.2814671993255615, "sampling/importance_sampling_ratio/max": 1.2244412899017334, "sampling/importance_sampling_ratio/mean": 0.5824019908905029, "sampling/importance_sampling_ratio/min": 4.063042240431969e-07, "sampling/sampling_logp_difference/max": 1.6813554763793945, "sampling/sampling_logp_difference/mean": 0.37124693393707275, "step": 1111, "step_time": 36.516586291007116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0751896649599075, "epoch": 0.01112, "grad_norm": 0.08589069545269012, "kl": 0.3046872019767761, "learning_rate": 7.999571355525498e-06, "loss": -0.0839, "step": 1112, "step_time": 16.457260499024414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 4.740740776062012, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5625781137496233, "epoch": 0.01113, "frac_reward_zero_std": 0.25, "grad_norm": 0.0576004795730114, "kl": 0.2201226307079196, "learning_rate": 7.999570558437343e-06, "loss": -0.0628, "num_tokens": 29828794.0, "reward": 0.5873219966888428, "reward_std": 1.3684287071228027, "rewards/rollout_reward_func/mean": 0.5873219966888428, "rewards/rollout_reward_func/std": 1.3684287071228027, "sampling/importance_sampling_ratio/max": 1.2287936210632324, "sampling/importance_sampling_ratio/mean": 0.7479391098022461, "sampling/importance_sampling_ratio/min": 1.1873677152607343e-07, "sampling/sampling_logp_difference/max": 2.4479384422302246, "sampling/sampling_logp_difference/mean": 0.25471505522727966, "step": 1113, "step_time": 32.95937912799127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5810222141444683, "epoch": 0.01114, "grad_norm": 0.06454763561487198, "kl": 0.21555488649755716, "learning_rate": 7.999569760608814e-06, "loss": -0.0628, "step": 1114, "step_time": 14.98995853998349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.28125, "completions/mean_terminated_length": 4.839999675750732, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3570620864629745, "epoch": 0.01115, "frac_reward_zero_std": 0.0, "grad_norm": 0.021322498098015785, "kl": 0.4291822202503681, "learning_rate": 7.999568962039914e-06, "loss": -0.0871, "num_tokens": 29877913.0, "reward": 0.6713842749595642, "reward_std": 1.2812917232513428, "rewards/rollout_reward_func/mean": 0.6713842749595642, "rewards/rollout_reward_func/std": 1.2812917232513428, "sampling/importance_sampling_ratio/max": 1.0829665660858154, "sampling/importance_sampling_ratio/mean": 0.6298973560333252, "sampling/importance_sampling_ratio/min": 1.0531162530566518e-12, "sampling/sampling_logp_difference/max": 3.135033130645752, "sampling/sampling_logp_difference/mean": 0.49645495414733887, "step": 1115, "step_time": 29.490416897984687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3654220290482044, "epoch": 0.01116, "grad_norm": 0.023836519569158554, "kl": 0.4017484951764345, "learning_rate": 7.99956816273064e-06, "loss": -0.087, "step": 1116, "step_time": 15.096203151028021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 4.319999694824219, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7183717843145132, "epoch": 0.01117, "frac_reward_zero_std": 0.0, "grad_norm": 0.08616286516189575, "kl": 0.3155640587210655, "learning_rate": 7.999567362680992e-06, "loss": -0.0702, "num_tokens": 29925689.0, "reward": 0.7430245876312256, "reward_std": 1.2622735500335693, "rewards/rollout_reward_func/mean": 0.7430245876312256, "rewards/rollout_reward_func/std": 1.2622735500335693, "sampling/importance_sampling_ratio/max": 1.2469093799591064, "sampling/importance_sampling_ratio/mean": 0.7178699970245361, "sampling/importance_sampling_ratio/min": 7.192881639639381e-06, "sampling/sampling_logp_difference/max": 1.9886815547943115, "sampling/sampling_logp_difference/mean": 0.32008862495422363, "step": 1117, "step_time": 26.318979853997007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7316968850791454, "epoch": 0.01118, "grad_norm": 0.0903438851237297, "kl": 0.2952552502974868, "learning_rate": 7.999566561890972e-06, "loss": -0.0706, "step": 1118, "step_time": 12.278776242994354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.96875, "completions/mean_terminated_length": 5.678571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.061338946223259, "epoch": 0.01119, "frac_reward_zero_std": 0.0, "grad_norm": 0.1735440343618393, "kl": 0.260301923379302, "learning_rate": 7.999565760360578e-06, "loss": -0.0823, "num_tokens": 29977121.0, "reward": -0.17834120988845825, "reward_std": 1.1447134017944336, "rewards/rollout_reward_func/mean": -0.17834120988845825, "rewards/rollout_reward_func/std": 1.1447134017944336, "sampling/importance_sampling_ratio/max": 1.1783429384231567, "sampling/importance_sampling_ratio/mean": 0.6153095960617065, "sampling/importance_sampling_ratio/min": 1.4669400343336747e-06, "sampling/sampling_logp_difference/max": 1.912014126777649, "sampling/sampling_logp_difference/mean": 0.3352658152580261, "step": 1119, "step_time": 30.138515516009647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.046583831310272, "epoch": 0.0112, "grad_norm": 0.1699945628643036, "kl": 0.2680789455771446, "learning_rate": 7.999564958089814e-06, "loss": -0.0827, "step": 1120, "step_time": 14.783911057980731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.9375, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5010821092873812, "epoch": 0.01121, "frac_reward_zero_std": 0.25, "grad_norm": 0.03139743208885193, "kl": 0.49044570606201887, "learning_rate": 7.999564155078678e-06, "loss": -0.0767, "num_tokens": 30023322.0, "reward": 1.1882978677749634, "reward_std": 1.2242377996444702, "rewards/rollout_reward_func/mean": 1.1882978677749634, "rewards/rollout_reward_func/std": 1.2242376804351807, "sampling/importance_sampling_ratio/max": 1.2406493425369263, "sampling/importance_sampling_ratio/mean": 0.7694371938705444, "sampling/importance_sampling_ratio/min": 2.099126322718803e-05, "sampling/sampling_logp_difference/max": 1.9624688625335693, "sampling/sampling_logp_difference/mean": 0.28862541913986206, "step": 1121, "step_time": 25.226247129001422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4942222610116005, "epoch": 0.01122, "grad_norm": 0.030547339469194412, "kl": 0.5289065418764949, "learning_rate": 7.999563351327168e-06, "loss": -0.0767, "step": 1122, "step_time": 11.831519909988856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.9375, "completions/mean_terminated_length": 4.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7759912014007568, "epoch": 0.01123, "frac_reward_zero_std": 0.25, "grad_norm": 0.0879656970500946, "kl": 0.2874838411808014, "learning_rate": 7.999562546835289e-06, "loss": -0.0423, "num_tokens": 30078532.0, "reward": 0.3254643678665161, "reward_std": 1.258151650428772, "rewards/rollout_reward_func/mean": 0.3254643678665161, "rewards/rollout_reward_func/std": 1.258151650428772, "sampling/importance_sampling_ratio/max": 1.1419801712036133, "sampling/importance_sampling_ratio/mean": 0.6955295205116272, "sampling/importance_sampling_ratio/min": 2.2808355879533337e-06, "sampling/sampling_logp_difference/max": 2.171339750289917, "sampling/sampling_logp_difference/mean": 0.32334986329078674, "step": 1123, "step_time": 32.03744143998483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7641060277819633, "epoch": 0.01124, "grad_norm": 0.08855796605348587, "kl": 0.298254013992846, "learning_rate": 7.999561741603036e-06, "loss": -0.0426, "step": 1124, "step_time": 15.513256848993478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.125, "completions/mean_terminated_length": 4.545454502105713, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2668730169534683, "epoch": 0.01125, "frac_reward_zero_std": 0.25, "grad_norm": 0.11034658551216125, "kl": 0.16661735926754773, "learning_rate": 7.999560935630413e-06, "loss": -0.07, "num_tokens": 30129750.0, "reward": 0.4109037518501282, "reward_std": 1.3415353298187256, "rewards/rollout_reward_func/mean": 0.4109037518501282, "rewards/rollout_reward_func/std": 1.341535210609436, "sampling/importance_sampling_ratio/max": 1.2063766717910767, "sampling/importance_sampling_ratio/mean": 0.6306101083755493, "sampling/importance_sampling_ratio/min": 6.49700595545255e-08, "sampling/sampling_logp_difference/max": 2.048928737640381, "sampling/sampling_logp_difference/mean": 0.3836723566055298, "step": 1125, "step_time": 27.700005468985182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.246053695678711, "epoch": 0.01126, "grad_norm": 0.10838324576616287, "kl": 0.16419571777805686, "learning_rate": 7.99956012891742e-06, "loss": -0.0707, "step": 1126, "step_time": 13.16452586997184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 4.799999713897705, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.466740452684462, "epoch": 0.01127, "frac_reward_zero_std": 0.25, "grad_norm": 0.028061792254447937, "kl": 0.3107897797599435, "learning_rate": 7.999559321464054e-06, "loss": -0.0826, "num_tokens": 30177780.0, "reward": 1.0553927421569824, "reward_std": 1.311907410621643, "rewards/rollout_reward_func/mean": 1.0553927421569824, "rewards/rollout_reward_func/std": 1.311907410621643, "sampling/importance_sampling_ratio/max": 1.125571370124817, "sampling/importance_sampling_ratio/mean": 0.7400088310241699, "sampling/importance_sampling_ratio/min": 7.281792932190001e-06, "sampling/sampling_logp_difference/max": 1.493186593055725, "sampling/sampling_logp_difference/mean": 0.23318514227867126, "step": 1127, "step_time": 30.601964795976528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.457545212469995, "epoch": 0.01128, "grad_norm": 0.032960835844278336, "kl": 0.3369014114141464, "learning_rate": 7.999558513270316e-06, "loss": -0.0826, "step": 1128, "step_time": 14.311262101997272 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.125, "completions/mean_terminated_length": 5.076923370361328, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7457075640559196, "epoch": 0.01129, "frac_reward_zero_std": 0.25, "grad_norm": 0.18066348135471344, "kl": 0.35896027460694313, "learning_rate": 7.999557704336211e-06, "loss": -0.0316, "num_tokens": 30227751.0, "reward": 0.2530360221862793, "reward_std": 1.3474379777908325, "rewards/rollout_reward_func/mean": 0.2530360221862793, "rewards/rollout_reward_func/std": 1.3474379777908325, "sampling/importance_sampling_ratio/max": 1.3756484985351562, "sampling/importance_sampling_ratio/mean": 0.6976557970046997, "sampling/importance_sampling_ratio/min": 5.044357180850056e-07, "sampling/sampling_logp_difference/max": 2.191620349884033, "sampling/sampling_logp_difference/mean": 0.32290124893188477, "step": 1129, "step_time": 25.911963876962545 }, { "clip_ratio/high_max": 0.022727273404598236, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "entropy": 1.7028870284557343, "epoch": 0.0113, "grad_norm": 0.15224872529506683, "kl": 0.39004579186439514, "learning_rate": 7.999556894661735e-06, "loss": -0.0314, "step": 1130, "step_time": 13.29845428004046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.5625, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7535864263772964, "epoch": 0.01131, "frac_reward_zero_std": 0.0, "grad_norm": 0.05079659819602966, "kl": 0.20269200950860977, "learning_rate": 7.999556084246888e-06, "loss": -0.1011, "num_tokens": 30284178.0, "reward": 0.9191474914550781, "reward_std": 1.2290148735046387, "rewards/rollout_reward_func/mean": 0.9191474914550781, "rewards/rollout_reward_func/std": 1.2290147542953491, "sampling/importance_sampling_ratio/max": 1.2985594272613525, "sampling/importance_sampling_ratio/mean": 0.6774352788925171, "sampling/importance_sampling_ratio/min": 4.01564506091745e-07, "sampling/sampling_logp_difference/max": 2.067577362060547, "sampling/sampling_logp_difference/mean": 0.30911749601364136, "step": 1131, "step_time": 28.87342051201267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7516220407560468, "epoch": 0.01132, "grad_norm": 0.04927363246679306, "kl": 0.20425910037010908, "learning_rate": 7.999555273091671e-06, "loss": -0.1012, "step": 1132, "step_time": 13.768230744972243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 4.769230842590332, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7679675966501236, "epoch": 0.01133, "frac_reward_zero_std": 0.0, "grad_norm": 0.1561540812253952, "kl": 0.2760644908994436, "learning_rate": 7.999554461196083e-06, "loss": -0.0584, "num_tokens": 30342117.0, "reward": 0.015312440693378448, "reward_std": 1.2377756834030151, "rewards/rollout_reward_func/mean": 0.015312440693378448, "rewards/rollout_reward_func/std": 1.2377756834030151, "sampling/importance_sampling_ratio/max": 1.1559770107269287, "sampling/importance_sampling_ratio/mean": 0.6659894585609436, "sampling/importance_sampling_ratio/min": 0.0001054650274454616, "sampling/sampling_logp_difference/max": 1.9642224311828613, "sampling/sampling_logp_difference/mean": 0.31696373224258423, "step": 1133, "step_time": 29.071260707001784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7744082286953926, "epoch": 0.01134, "grad_norm": 0.18111135065555573, "kl": 0.2689170967787504, "learning_rate": 7.999553648560128e-06, "loss": -0.0593, "step": 1134, "step_time": 14.211019202964962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.8125, "completions/mean_terminated_length": 4.6086955070495605, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7614568378776312, "epoch": 0.01135, "frac_reward_zero_std": 0.0, "grad_norm": 0.2641589343547821, "kl": 0.25597216188907623, "learning_rate": 7.999552835183802e-06, "loss": -0.1018, "num_tokens": 30393423.0, "reward": 0.8745843172073364, "reward_std": 1.2902436256408691, "rewards/rollout_reward_func/mean": 0.8745843172073364, "rewards/rollout_reward_func/std": 1.2902436256408691, "sampling/importance_sampling_ratio/max": 1.5222952365875244, "sampling/importance_sampling_ratio/mean": 0.6533483862876892, "sampling/importance_sampling_ratio/min": 6.292309262789786e-05, "sampling/sampling_logp_difference/max": 1.9744164943695068, "sampling/sampling_logp_difference/mean": 0.2804012596607208, "step": 1135, "step_time": 35.572180626011686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7619808604940772, "epoch": 0.01136, "grad_norm": 0.2364952266216278, "kl": 0.26844569481909275, "learning_rate": 7.99955202106711e-06, "loss": -0.1024, "step": 1136, "step_time": 14.09106427601364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.96875, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.00716133415699, "epoch": 0.01137, "frac_reward_zero_std": 0.0, "grad_norm": 0.07246755063533783, "kl": 0.23784038820303977, "learning_rate": 7.999551206210045e-06, "loss": -0.0987, "num_tokens": 30452691.0, "reward": 0.05023771524429321, "reward_std": 1.202385425567627, "rewards/rollout_reward_func/mean": 0.05023771524429321, "rewards/rollout_reward_func/std": 1.2023853063583374, "sampling/importance_sampling_ratio/max": 1.171090006828308, "sampling/importance_sampling_ratio/mean": 0.5784920454025269, "sampling/importance_sampling_ratio/min": 1.1378366338021806e-07, "sampling/sampling_logp_difference/max": 2.4348676204681396, "sampling/sampling_logp_difference/mean": 0.3851782977581024, "step": 1137, "step_time": 33.02776148202247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9988702479749918, "epoch": 0.01138, "grad_norm": 0.07085876911878586, "kl": 0.2349588219076395, "learning_rate": 7.999550390612613e-06, "loss": -0.0987, "step": 1138, "step_time": 14.093697062999127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.375, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5157849341630936, "epoch": 0.01139, "frac_reward_zero_std": 0.0, "grad_norm": 0.02399151585996151, "kl": 0.14541830914095044, "learning_rate": 7.999549574274814e-06, "loss": -0.089, "num_tokens": 30520158.0, "reward": 0.11646765470504761, "reward_std": 1.281295657157898, "rewards/rollout_reward_func/mean": 0.11646765470504761, "rewards/rollout_reward_func/std": 1.281295657157898, "sampling/importance_sampling_ratio/max": 1.1922634840011597, "sampling/importance_sampling_ratio/mean": 0.4352957606315613, "sampling/importance_sampling_ratio/min": 4.1901108488673344e-05, "sampling/sampling_logp_difference/max": 1.9321300983428955, "sampling/sampling_logp_difference/mean": 0.3634474277496338, "step": 1139, "step_time": 34.50415546099248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5044014006853104, "epoch": 0.0114, "grad_norm": 0.018590711057186127, "kl": 0.15169746754691005, "learning_rate": 7.999548757196645e-06, "loss": -0.0891, "step": 1140, "step_time": 14.883146989988745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 4.307692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0439697364345193, "epoch": 0.01141, "frac_reward_zero_std": 0.0, "grad_norm": 0.11529025435447693, "kl": 0.528473187237978, "learning_rate": 7.999547939378108e-06, "loss": -0.062, "num_tokens": 30572111.0, "reward": 0.9134259819984436, "reward_std": 1.268093466758728, "rewards/rollout_reward_func/mean": 0.9134259819984436, "rewards/rollout_reward_func/std": 1.268093466758728, "sampling/importance_sampling_ratio/max": 1.1175018548965454, "sampling/importance_sampling_ratio/mean": 0.7822005152702332, "sampling/importance_sampling_ratio/min": 8.967313078755978e-06, "sampling/sampling_logp_difference/max": 2.087947368621826, "sampling/sampling_logp_difference/mean": 0.2454698532819748, "step": 1141, "step_time": 25.992034593014978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0382545399479568, "epoch": 0.01142, "grad_norm": 0.10953725129365921, "kl": 0.5100617054849863, "learning_rate": 7.999547120819201e-06, "loss": -0.0625, "step": 1142, "step_time": 12.633786143996986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.71875, "completions/mean_terminated_length": 4.119999885559082, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4016475724056363, "epoch": 0.01143, "frac_reward_zero_std": 0.0, "grad_norm": 0.07469052076339722, "kl": 0.7537874644622207, "learning_rate": 7.99954630151993e-06, "loss": -0.0802, "num_tokens": 30628509.0, "reward": 0.8031229972839355, "reward_std": 1.2181990146636963, "rewards/rollout_reward_func/mean": 0.8031229972839355, "rewards/rollout_reward_func/std": 1.2181990146636963, "sampling/importance_sampling_ratio/max": 1.3462599515914917, "sampling/importance_sampling_ratio/mean": 0.7648219466209412, "sampling/importance_sampling_ratio/min": 3.3585088203835767e-06, "sampling/sampling_logp_difference/max": 1.9414551258087158, "sampling/sampling_logp_difference/mean": 0.2996438145637512, "step": 1143, "step_time": 29.19863337399147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3980871513485909, "epoch": 0.01144, "grad_norm": 0.06680317968130112, "kl": 0.7243066770024598, "learning_rate": 7.99954548148029e-06, "loss": -0.0803, "step": 1144, "step_time": 12.961874650995014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.96875, "completions/mean_terminated_length": 6.217391490936279, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2580591402947903, "epoch": 0.01145, "frac_reward_zero_std": 0.0, "grad_norm": 0.07777944207191467, "kl": 0.20232511218637228, "learning_rate": 7.999544660700282e-06, "loss": -0.075, "num_tokens": 30683152.0, "reward": -0.11812744289636612, "reward_std": 1.0051411390304565, "rewards/rollout_reward_func/mean": -0.11812744289636612, "rewards/rollout_reward_func/std": 1.0051411390304565, "sampling/importance_sampling_ratio/max": 1.2327797412872314, "sampling/importance_sampling_ratio/mean": 0.5654770135879517, "sampling/importance_sampling_ratio/min": 2.4171071810741296e-09, "sampling/sampling_logp_difference/max": 2.122671604156494, "sampling/sampling_logp_difference/mean": 0.501248836517334, "step": 1145, "step_time": 30.01279611897189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2558094561100006, "epoch": 0.01146, "grad_norm": 0.07727237045764923, "kl": 0.20285151340067387, "learning_rate": 7.999543839179908e-06, "loss": -0.0752, "step": 1146, "step_time": 13.826150085995323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.15625, "completions/mean_terminated_length": 6.650000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.6554125398397446, "epoch": 0.01147, "frac_reward_zero_std": 0.0, "grad_norm": 0.047761667519807816, "kl": 0.17345353635028005, "learning_rate": 7.999543016919168e-06, "loss": -0.0942, "num_tokens": 30743719.0, "reward": 0.2783471941947937, "reward_std": 1.3103965520858765, "rewards/rollout_reward_func/mean": 0.2783471941947937, "rewards/rollout_reward_func/std": 1.3103965520858765, "sampling/importance_sampling_ratio/max": 1.1618878841400146, "sampling/importance_sampling_ratio/mean": 0.4663870930671692, "sampling/importance_sampling_ratio/min": 1.0947708688036073e-05, "sampling/sampling_logp_difference/max": 1.803712248802185, "sampling/sampling_logp_difference/mean": 0.38916701078414917, "step": 1147, "step_time": 31.59881783099263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.654322639107704, "epoch": 0.01148, "grad_norm": 0.04774453490972519, "kl": 0.17364157270640135, "learning_rate": 7.99954219391806e-06, "loss": -0.0942, "step": 1148, "step_time": 12.643626160992426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.125, "completions/mean_terminated_length": 5.076923370361328, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7217710353434086, "epoch": 0.01149, "frac_reward_zero_std": 0.0, "grad_norm": 0.10136134177446365, "kl": 0.3239452764391899, "learning_rate": 7.999541370176585e-06, "loss": -0.0629, "num_tokens": 30799718.0, "reward": 0.34456098079681396, "reward_std": 1.108950138092041, "rewards/rollout_reward_func/mean": 0.34456098079681396, "rewards/rollout_reward_func/std": 1.108950138092041, "sampling/importance_sampling_ratio/max": 1.407291054725647, "sampling/importance_sampling_ratio/mean": 0.7590674161911011, "sampling/importance_sampling_ratio/min": 1.2127522097671317e-07, "sampling/sampling_logp_difference/max": 2.7020373344421387, "sampling/sampling_logp_difference/mean": 0.3175085783004761, "step": 1149, "step_time": 30.659973344023456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7168236104771495, "epoch": 0.0115, "grad_norm": 0.09752672165632248, "kl": 0.3220280222594738, "learning_rate": 7.999540545694743e-06, "loss": -0.0629, "step": 1150, "step_time": 14.1981053329946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.0625, "completions/mean_terminated_length": 4.559999942779541, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.539028437808156, "epoch": 0.01151, "frac_reward_zero_std": 0.25, "grad_norm": 0.09224138408899307, "kl": 0.639981223270297, "learning_rate": 7.999539720472537e-06, "loss": -0.0634, "num_tokens": 30851104.0, "reward": 0.8207297921180725, "reward_std": 1.0648674964904785, "rewards/rollout_reward_func/mean": 0.8207297921180725, "rewards/rollout_reward_func/std": 1.0648674964904785, "sampling/importance_sampling_ratio/max": 1.2798012495040894, "sampling/importance_sampling_ratio/mean": 0.7660243511199951, "sampling/importance_sampling_ratio/min": 5.087119006930152e-06, "sampling/sampling_logp_difference/max": 1.9637565612792969, "sampling/sampling_logp_difference/mean": 0.3383559286594391, "step": 1151, "step_time": 26.761956342990743 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 1.5401544589549303, "epoch": 0.01152, "grad_norm": 0.07018952071666718, "kl": 0.5901477606967092, "learning_rate": 7.999538894509965e-06, "loss": -0.0636, "step": 1152, "step_time": 12.725511433032807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.28125, "completions/mean_terminated_length": 4.7727274894714355, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.123797595500946, "epoch": 0.01153, "frac_reward_zero_std": 0.0, "grad_norm": 0.18960343301296234, "kl": 1.043368306942284, "learning_rate": 7.999538067807026e-06, "loss": -0.0701, "num_tokens": 30909701.0, "reward": 0.517935574054718, "reward_std": 1.2432582378387451, "rewards/rollout_reward_func/mean": 0.517935574054718, "rewards/rollout_reward_func/std": 1.2432581186294556, "sampling/importance_sampling_ratio/max": 1.1823804378509521, "sampling/importance_sampling_ratio/mean": 0.5830069780349731, "sampling/importance_sampling_ratio/min": 3.19336868415121e-06, "sampling/sampling_logp_difference/max": 2.0624375343322754, "sampling/sampling_logp_difference/mean": 0.3597850501537323, "step": 1153, "step_time": 33.75103666697396 }, { "clip_ratio/high_max": 0.019323671702295542, "clip_ratio/high_mean": 0.009661835851147771, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009661835851147771, "entropy": 2.120434492826462, "epoch": 0.01154, "grad_norm": 0.19092024862766266, "kl": 0.7305566687136889, "learning_rate": 7.999537240363722e-06, "loss": -0.0711, "step": 1154, "step_time": 15.749998847983079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.875, "completions/mean_terminated_length": 5.599999904632568, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.006225436925888, "epoch": 0.01155, "frac_reward_zero_std": 0.0, "grad_norm": 0.07593078911304474, "kl": 0.22770039876922965, "learning_rate": 7.999536412180054e-06, "loss": -0.0675, "num_tokens": 30973761.0, "reward": 0.26096856594085693, "reward_std": 1.2562141418457031, "rewards/rollout_reward_func/mean": 0.26096856594085693, "rewards/rollout_reward_func/std": 1.2562140226364136, "sampling/importance_sampling_ratio/max": 1.2588112354278564, "sampling/importance_sampling_ratio/mean": 0.566112756729126, "sampling/importance_sampling_ratio/min": 9.893448805087246e-06, "sampling/sampling_logp_difference/max": 1.9582960605621338, "sampling/sampling_logp_difference/mean": 0.32991015911102295, "step": 1155, "step_time": 34.83790388701891 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 2.004278637468815, "epoch": 0.01156, "grad_norm": 0.0706413984298706, "kl": 0.22220681607723236, "learning_rate": 7.99953558325602e-06, "loss": -0.0674, "step": 1156, "step_time": 15.82873406202998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.09375, "completions/mean_terminated_length": 7.500000476837158, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.09649312030524, "epoch": 0.01157, "frac_reward_zero_std": 0.0, "grad_norm": 0.04996068775653839, "kl": 0.1668397607281804, "learning_rate": 7.999534753591622e-06, "loss": -0.097, "num_tokens": 31031758.0, "reward": 0.20199346542358398, "reward_std": 1.2628626823425293, "rewards/rollout_reward_func/mean": 0.20199346542358398, "rewards/rollout_reward_func/std": 1.2628626823425293, "sampling/importance_sampling_ratio/max": 1.331475019454956, "sampling/importance_sampling_ratio/mean": 0.5790209770202637, "sampling/importance_sampling_ratio/min": 5.164271215107874e-07, "sampling/sampling_logp_difference/max": 1.905250906944275, "sampling/sampling_logp_difference/mean": 0.3458172082901001, "step": 1157, "step_time": 32.15730526900734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1003597751259804, "epoch": 0.01158, "grad_norm": 0.05044679716229439, "kl": 0.1643824204802513, "learning_rate": 7.999533923186858e-06, "loss": -0.097, "step": 1158, "step_time": 13.784402981007588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.34375, "completions/mean_terminated_length": 5.34782600402832, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8679540222510695, "epoch": 0.01159, "frac_reward_zero_std": 0.0, "grad_norm": 0.05766094848513603, "kl": 0.23840922769159079, "learning_rate": 7.99953309204173e-06, "loss": -0.1038, "num_tokens": 31089633.0, "reward": 0.6265707612037659, "reward_std": 1.2332732677459717, "rewards/rollout_reward_func/mean": 0.6265707612037659, "rewards/rollout_reward_func/std": 1.2332732677459717, "sampling/importance_sampling_ratio/max": 1.2603431940078735, "sampling/importance_sampling_ratio/mean": 0.5880229473114014, "sampling/importance_sampling_ratio/min": 6.514012056868523e-06, "sampling/sampling_logp_difference/max": 1.7598682641983032, "sampling/sampling_logp_difference/mean": 0.34830617904663086, "step": 1159, "step_time": 32.35901464401104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8662337809801102, "epoch": 0.0116, "grad_norm": 0.05709770321846008, "kl": 0.23847872577607632, "learning_rate": 7.999532260156239e-06, "loss": -0.1038, "step": 1160, "step_time": 14.285139749990776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.125, "completions/mean_terminated_length": 4.545454502105713, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0361996591091156, "epoch": 0.01161, "frac_reward_zero_std": 0.0, "grad_norm": 0.19969427585601807, "kl": 0.7848611008375883, "learning_rate": 7.999531427530382e-06, "loss": -0.0465, "num_tokens": 31143998.0, "reward": 0.6626409292221069, "reward_std": 1.3155076503753662, "rewards/rollout_reward_func/mean": 0.6626409292221069, "rewards/rollout_reward_func/std": 1.3155075311660767, "sampling/importance_sampling_ratio/max": 1.6067825555801392, "sampling/importance_sampling_ratio/mean": 0.6627009510993958, "sampling/importance_sampling_ratio/min": 1.7945986030554195e-08, "sampling/sampling_logp_difference/max": 2.0761520862579346, "sampling/sampling_logp_difference/mean": 0.410585880279541, "step": 1161, "step_time": 29.54959483098355 }, { "clip_ratio/high_max": 0.01145833358168602, "clip_ratio/high_mean": 0.00572916679084301, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00572916679084301, "entropy": 2.034460633993149, "epoch": 0.01162, "grad_norm": 0.1534283310174942, "kl": 0.6996578145772219, "learning_rate": 7.999530594164162e-06, "loss": -0.0478, "step": 1162, "step_time": 12.450106617980055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.71875, "completions/mean_terminated_length": 4.349999904632568, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0792348384857178, "epoch": 0.01163, "frac_reward_zero_std": 0.0, "grad_norm": 0.1373637169599533, "kl": 0.26650508865714073, "learning_rate": 7.999529760057578e-06, "loss": -0.0856, "num_tokens": 31207170.0, "reward": 0.02116948366165161, "reward_std": 1.231287956237793, "rewards/rollout_reward_func/mean": 0.02116948366165161, "rewards/rollout_reward_func/std": 1.2312878370285034, "sampling/importance_sampling_ratio/max": 1.368584156036377, "sampling/importance_sampling_ratio/mean": 0.5645899772644043, "sampling/importance_sampling_ratio/min": 5.610086284413285e-10, "sampling/sampling_logp_difference/max": 2.376274585723877, "sampling/sampling_logp_difference/mean": 0.3578055500984192, "step": 1163, "step_time": 33.45731129599153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0904111564159393, "epoch": 0.01164, "grad_norm": 0.11827000975608826, "kl": 0.26218626322224736, "learning_rate": 7.999528925210631e-06, "loss": -0.0859, "step": 1164, "step_time": 15.608804136005347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 4.689655303955078, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2602137131616473, "epoch": 0.01165, "frac_reward_zero_std": 0.25, "grad_norm": 0.03968183696269989, "kl": 0.5404568165540695, "learning_rate": 7.999528089623321e-06, "loss": -0.0625, "num_tokens": 31255469.0, "reward": 0.9220969080924988, "reward_std": 1.250258207321167, "rewards/rollout_reward_func/mean": 0.9220969080924988, "rewards/rollout_reward_func/std": 1.250258207321167, "sampling/importance_sampling_ratio/max": 1.1619912385940552, "sampling/importance_sampling_ratio/mean": 0.8247009515762329, "sampling/importance_sampling_ratio/min": 2.508014858904062e-06, "sampling/sampling_logp_difference/max": 2.1579952239990234, "sampling/sampling_logp_difference/mean": 0.25094929337501526, "step": 1165, "step_time": 26.2650987420202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.266124738380313, "epoch": 0.01166, "grad_norm": 0.03901486098766327, "kl": 0.5294530410319567, "learning_rate": 7.999527253295647e-06, "loss": -0.0625, "step": 1166, "step_time": 12.481670534994919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.6875, "completions/mean_terminated_length": 4.857142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.446935711428523, "epoch": 0.01167, "frac_reward_zero_std": 0.25, "grad_norm": 0.01888754777610302, "kl": 0.24974808283150196, "learning_rate": 7.99952641622761e-06, "loss": -0.069, "num_tokens": 31311425.0, "reward": 0.528449296951294, "reward_std": 1.337836503982544, "rewards/rollout_reward_func/mean": 0.528449296951294, "rewards/rollout_reward_func/std": 1.3378366231918335, "sampling/importance_sampling_ratio/max": 1.2185994386672974, "sampling/importance_sampling_ratio/mean": 0.6009622812271118, "sampling/importance_sampling_ratio/min": 5.906484261686273e-07, "sampling/sampling_logp_difference/max": 1.957965612411499, "sampling/sampling_logp_difference/mean": 0.38878118991851807, "step": 1167, "step_time": 29.114442019999842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4520264007151127, "epoch": 0.01168, "grad_norm": 0.019396044313907623, "kl": 0.26219878904521465, "learning_rate": 7.999525578419213e-06, "loss": -0.0689, "step": 1168, "step_time": 12.733182362979278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.46875, "completions/mean_terminated_length": 5.5217390060424805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.032285839319229, "epoch": 0.01169, "frac_reward_zero_std": 0.0, "grad_norm": 0.17591017484664917, "kl": 0.22999340295791626, "learning_rate": 7.999524739870451e-06, "loss": -0.0814, "num_tokens": 31363937.0, "reward": 0.33001673221588135, "reward_std": 1.346757411956787, "rewards/rollout_reward_func/mean": 0.33001673221588135, "rewards/rollout_reward_func/std": 1.3467572927474976, "sampling/importance_sampling_ratio/max": 1.2643133401870728, "sampling/importance_sampling_ratio/mean": 0.5763413310050964, "sampling/importance_sampling_ratio/min": 5.693500952475006e-06, "sampling/sampling_logp_difference/max": 2.0515828132629395, "sampling/sampling_logp_difference/mean": 0.3515523374080658, "step": 1169, "step_time": 31.90140485399752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0346597135066986, "epoch": 0.0117, "grad_norm": 0.1770981401205063, "kl": 0.2272342275828123, "learning_rate": 7.999523900581328e-06, "loss": -0.0814, "step": 1170, "step_time": 14.15909983702295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.9375, "completions/mean_terminated_length": 6.17391300201416, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4258459210395813, "epoch": 0.01171, "frac_reward_zero_std": 0.0, "grad_norm": 0.11188142001628876, "kl": 0.2522993269376457, "learning_rate": 7.999523060551842e-06, "loss": -0.0757, "num_tokens": 31417883.0, "reward": 0.6633484959602356, "reward_std": 1.3346856832504272, "rewards/rollout_reward_func/mean": 0.6633484959602356, "rewards/rollout_reward_func/std": 1.3346856832504272, "sampling/importance_sampling_ratio/max": 1.4127438068389893, "sampling/importance_sampling_ratio/mean": 0.5380508899688721, "sampling/importance_sampling_ratio/min": 3.2525608162359276e-07, "sampling/sampling_logp_difference/max": 2.1365156173706055, "sampling/sampling_logp_difference/mean": 0.44244834780693054, "step": 1171, "step_time": 28.810710039993864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4192308634519577, "epoch": 0.01172, "grad_norm": 0.07792581617832184, "kl": 0.25055402237921953, "learning_rate": 7.999522219781996e-06, "loss": -0.0761, "step": 1172, "step_time": 13.538625626984867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.1875, "completions/mean_terminated_length": 5.099999904632568, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1836121417582035, "epoch": 0.01173, "frac_reward_zero_std": 0.0, "grad_norm": 0.038683950901031494, "kl": 0.3094293794129044, "learning_rate": 7.999521378271788e-06, "loss": -0.0796, "num_tokens": 31476624.0, "reward": 0.5097702741622925, "reward_std": 1.3032934665679932, "rewards/rollout_reward_func/mean": 0.5097702741622925, "rewards/rollout_reward_func/std": 1.3032933473587036, "sampling/importance_sampling_ratio/max": 1.3581371307373047, "sampling/importance_sampling_ratio/mean": 0.5183039903640747, "sampling/importance_sampling_ratio/min": 9.732650596561143e-07, "sampling/sampling_logp_difference/max": 1.8909629583358765, "sampling/sampling_logp_difference/mean": 0.3545587360858917, "step": 1173, "step_time": 31.879266495030606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1842852868139744, "epoch": 0.01174, "grad_norm": 0.03973624110221863, "kl": 0.3221689786296338, "learning_rate": 7.99952053602122e-06, "loss": -0.0797, "step": 1174, "step_time": 14.478335716034053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.125, "completions/mean_terminated_length": 4.545454502105713, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9602848626673222, "epoch": 0.01175, "frac_reward_zero_std": 0.25, "grad_norm": 0.0334438681602478, "kl": 0.15817559510469437, "learning_rate": 7.999519693030286e-06, "loss": -0.0486, "num_tokens": 31531143.0, "reward": 0.690352201461792, "reward_std": 1.3360716104507446, "rewards/rollout_reward_func/mean": 0.690352201461792, "rewards/rollout_reward_func/std": 1.3360716104507446, "sampling/importance_sampling_ratio/max": 1.351134181022644, "sampling/importance_sampling_ratio/mean": 0.6724292039871216, "sampling/importance_sampling_ratio/min": 2.0567867409226892e-07, "sampling/sampling_logp_difference/max": 2.213564872741699, "sampling/sampling_logp_difference/mean": 0.3189319968223572, "step": 1175, "step_time": 32.59503838297678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9554362846538424, "epoch": 0.01176, "grad_norm": 0.035646338015794754, "kl": 0.15791993448510766, "learning_rate": 7.999518849298995e-06, "loss": -0.0486, "step": 1176, "step_time": 15.097432909009513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.71875, "completions/mean_terminated_length": 5.807692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7907247096300125, "epoch": 0.01177, "frac_reward_zero_std": 0.0, "grad_norm": 0.07858292013406754, "kl": 0.19481358211487532, "learning_rate": 7.999518004827344e-06, "loss": -0.0802, "num_tokens": 31572876.0, "reward": 0.5775394439697266, "reward_std": 1.4855854511260986, "rewards/rollout_reward_func/mean": 0.5775394439697266, "rewards/rollout_reward_func/std": 1.4855854511260986, "sampling/importance_sampling_ratio/max": 1.1605304479599, "sampling/importance_sampling_ratio/mean": 0.7006841897964478, "sampling/importance_sampling_ratio/min": 5.755834354204126e-05, "sampling/sampling_logp_difference/max": 2.0791115760803223, "sampling/sampling_logp_difference/mean": 0.2707636058330536, "step": 1177, "step_time": 25.30125063401647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7881303578615189, "epoch": 0.01178, "grad_norm": 0.07345212250947952, "kl": 0.19719976373016834, "learning_rate": 7.99951715961533e-06, "loss": -0.0805, "step": 1178, "step_time": 12.700640584051143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 9.3125, "completions/mean_terminated_length": 4.111111164093018, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.954313383437693, "epoch": 0.01179, "frac_reward_zero_std": 0.0, "grad_norm": 0.05097782984375954, "kl": 0.7123480308800936, "learning_rate": 7.999516313662957e-06, "loss": -0.06, "num_tokens": 31640222.0, "reward": -0.16959929466247559, "reward_std": 1.0052556991577148, "rewards/rollout_reward_func/mean": -0.16959929466247559, "rewards/rollout_reward_func/std": 1.0052556991577148, "sampling/importance_sampling_ratio/max": 1.1806440353393555, "sampling/importance_sampling_ratio/mean": 0.4118434190750122, "sampling/importance_sampling_ratio/min": 1.7053077954187756e-06, "sampling/sampling_logp_difference/max": 2.3977487087249756, "sampling/sampling_logp_difference/mean": 0.4871542453765869, "step": 1179, "step_time": 35.181596460955916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.952322106808424, "epoch": 0.0118, "grad_norm": 0.053400829434394836, "kl": 0.7127014379948378, "learning_rate": 7.999515466970224e-06, "loss": -0.06, "step": 1180, "step_time": 15.256609272997594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.84375, "completions/mean_terminated_length": 4.65217399597168, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7195039056241512, "epoch": 0.01181, "frac_reward_zero_std": 0.0, "grad_norm": 0.13326327502727509, "kl": 0.23658557562157512, "learning_rate": 7.99951461953713e-06, "loss": -0.0434, "num_tokens": 31701650.0, "reward": 0.035741060972213745, "reward_std": 1.2573018074035645, "rewards/rollout_reward_func/mean": 0.035741060972213745, "rewards/rollout_reward_func/std": 1.2573018074035645, "sampling/importance_sampling_ratio/max": 1.3654260635375977, "sampling/importance_sampling_ratio/mean": 0.6345268487930298, "sampling/importance_sampling_ratio/min": 2.88132895320814e-07, "sampling/sampling_logp_difference/max": 2.1280131340026855, "sampling/sampling_logp_difference/mean": 0.33209341764450073, "step": 1181, "step_time": 33.415890591044445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7232664451003075, "epoch": 0.01182, "grad_norm": 0.1419399082660675, "kl": 0.2365031612571329, "learning_rate": 7.999513771363676e-06, "loss": -0.0436, "step": 1182, "step_time": 15.461216952942777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.09375, "completions/mean_terminated_length": 5.533333778381348, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.6571429520845413, "epoch": 0.01183, "frac_reward_zero_std": 0.0, "grad_norm": 0.052730441093444824, "kl": 0.16826906776987016, "learning_rate": 7.999512922449865e-06, "loss": -0.0905, "num_tokens": 31768271.0, "reward": -0.012259617447853088, "reward_std": 1.0839688777923584, "rewards/rollout_reward_func/mean": -0.012259617447853088, "rewards/rollout_reward_func/std": 1.0839688777923584, "sampling/importance_sampling_ratio/max": 1.3812062740325928, "sampling/importance_sampling_ratio/mean": 0.42097800970077515, "sampling/importance_sampling_ratio/min": 1.7744913520800765e-06, "sampling/sampling_logp_difference/max": 2.27079176902771, "sampling/sampling_logp_difference/mean": 0.38368284702301025, "step": 1183, "step_time": 35.387689252063865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.6587533950805664, "epoch": 0.01184, "grad_norm": 0.05679849535226822, "kl": 0.16518756235018373, "learning_rate": 7.999512072795693e-06, "loss": -0.0906, "step": 1184, "step_time": 14.20199014103855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5397808067500591, "epoch": 0.01185, "frac_reward_zero_std": 0.0, "grad_norm": 0.11719679832458496, "kl": 0.43608060711994767, "learning_rate": 7.999511222401162e-06, "loss": -0.0721, "num_tokens": 31821130.0, "reward": 0.3252323269844055, "reward_std": 1.3305044174194336, "rewards/rollout_reward_func/mean": 0.3252323269844055, "rewards/rollout_reward_func/std": 1.3305044174194336, "sampling/importance_sampling_ratio/max": 1.7659555673599243, "sampling/importance_sampling_ratio/mean": 0.7053008079528809, "sampling/importance_sampling_ratio/min": 4.12480494560441e-06, "sampling/sampling_logp_difference/max": 2.3148584365844727, "sampling/sampling_logp_difference/mean": 0.35524219274520874, "step": 1185, "step_time": 27.768190155999037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 1.5444974349811673, "epoch": 0.01186, "grad_norm": 0.11359264701604843, "kl": 0.4486635294742882, "learning_rate": 7.999510371266273e-06, "loss": -0.0722, "step": 1186, "step_time": 12.653401004965417 }, { "clip_ratio/high_max": 0.007352941203862429, "clip_ratio/high_mean": 0.0036764706019312143, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036764706019312143, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 5.629629611968994, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7286012033000588, "epoch": 0.01187, "frac_reward_zero_std": 0.0, "grad_norm": 0.05460545793175697, "kl": 0.5285473633557558, "learning_rate": 7.999509519391025e-06, "loss": -0.0685, "num_tokens": 31871380.0, "reward": 0.6787666082382202, "reward_std": 1.4468084573745728, "rewards/rollout_reward_func/mean": 0.6787666082382202, "rewards/rollout_reward_func/std": 1.4468084573745728, "sampling/importance_sampling_ratio/max": 1.15492844581604, "sampling/importance_sampling_ratio/mean": 0.7119982242584229, "sampling/importance_sampling_ratio/min": 1.611259570211132e-08, "sampling/sampling_logp_difference/max": 2.0997304916381836, "sampling/sampling_logp_difference/mean": 0.35777604579925537, "step": 1187, "step_time": 25.571059356007027 }, { "clip_ratio/high_max": 0.007352941203862429, "clip_ratio/high_mean": 0.0036764706019312143, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036764706019312143, "entropy": 1.7277824627235532, "epoch": 0.01188, "grad_norm": 0.05472825840115547, "kl": 0.5303734764456749, "learning_rate": 7.999508666775417e-06, "loss": -0.0685, "step": 1188, "step_time": 13.254961988015566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.28125, "completions/mean_terminated_length": 4.839999675750732, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.532440321519971, "epoch": 0.01189, "frac_reward_zero_std": 0.25, "grad_norm": 0.07592929154634476, "kl": 0.2576689478009939, "learning_rate": 7.999507813419454e-06, "loss": -0.0713, "num_tokens": 31921011.0, "reward": 0.9264465570449829, "reward_std": 0.9678258299827576, "rewards/rollout_reward_func/mean": 0.9264465570449829, "rewards/rollout_reward_func/std": 0.9678257703781128, "sampling/importance_sampling_ratio/max": 1.2098314762115479, "sampling/importance_sampling_ratio/mean": 0.7279890775680542, "sampling/importance_sampling_ratio/min": 3.282889053934923e-07, "sampling/sampling_logp_difference/max": 2.1461567878723145, "sampling/sampling_logp_difference/mean": 0.2923475205898285, "step": 1189, "step_time": 32.49002824095078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5340370209887624, "epoch": 0.0119, "grad_norm": 0.08407475054264069, "kl": 0.2605892391875386, "learning_rate": 7.999506959323131e-06, "loss": -0.0711, "step": 1190, "step_time": 13.279373428988038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.59375, "completions/mean_terminated_length": 4.304347991943359, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6622909605503082, "epoch": 0.01191, "frac_reward_zero_std": 0.0, "grad_norm": 0.09365557134151459, "kl": 0.626086093718186, "learning_rate": 7.999506104486449e-06, "loss": -0.0801, "num_tokens": 31985853.0, "reward": 0.3145141005516052, "reward_std": 1.2377145290374756, "rewards/rollout_reward_func/mean": 0.3145141005516052, "rewards/rollout_reward_func/std": 1.237714409828186, "sampling/importance_sampling_ratio/max": 1.5315427780151367, "sampling/importance_sampling_ratio/mean": 0.6375303268432617, "sampling/importance_sampling_ratio/min": 0.00010647126327967271, "sampling/sampling_logp_difference/max": 1.9017075300216675, "sampling/sampling_logp_difference/mean": 0.3156123161315918, "step": 1191, "step_time": 35.09260274798726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6658582836389542, "epoch": 0.01192, "grad_norm": 0.09033451974391937, "kl": 0.6207953218836337, "learning_rate": 7.99950524890941e-06, "loss": -0.0806, "step": 1192, "step_time": 16.361919641989516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9803903540596366, "epoch": 0.01193, "frac_reward_zero_std": 0.25, "grad_norm": 0.06862825900316238, "kl": 0.3664306802675128, "learning_rate": 7.999504392592015e-06, "loss": -0.0594, "num_tokens": 32040596.0, "reward": 0.7492655515670776, "reward_std": 1.2900429964065552, "rewards/rollout_reward_func/mean": 0.7492655515670776, "rewards/rollout_reward_func/std": 1.2900429964065552, "sampling/importance_sampling_ratio/max": 1.1901371479034424, "sampling/importance_sampling_ratio/mean": 0.629554808139801, "sampling/importance_sampling_ratio/min": 4.462742708710721e-08, "sampling/sampling_logp_difference/max": 2.173858165740967, "sampling/sampling_logp_difference/mean": 0.36129698157310486, "step": 1193, "step_time": 30.630819758051075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9889053385704756, "epoch": 0.01194, "grad_norm": 0.07091491669416428, "kl": 0.3677344946190715, "learning_rate": 7.999503535534264e-06, "loss": -0.0596, "step": 1194, "step_time": 13.897228677058592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.21875, "completions/mean_terminated_length": 5.192307949066162, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6986592523753643, "epoch": 0.01195, "frac_reward_zero_std": 0.0, "grad_norm": 0.07008872926235199, "kl": 0.26895804703235626, "learning_rate": 7.999502677736154e-06, "loss": -0.081, "num_tokens": 32091781.0, "reward": 0.9240387678146362, "reward_std": 1.276707649230957, "rewards/rollout_reward_func/mean": 0.9240387678146362, "rewards/rollout_reward_func/std": 1.276707649230957, "sampling/importance_sampling_ratio/max": 1.2534865140914917, "sampling/importance_sampling_ratio/mean": 0.7628052830696106, "sampling/importance_sampling_ratio/min": 9.151751328317914e-06, "sampling/sampling_logp_difference/max": 2.0801472663879395, "sampling/sampling_logp_difference/mean": 0.31067973375320435, "step": 1195, "step_time": 27.142792325030314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.705754511989653, "epoch": 0.01196, "grad_norm": 0.08336542546749115, "kl": 0.26666776835918427, "learning_rate": 7.999501819197687e-06, "loss": -0.081, "step": 1196, "step_time": 11.852382883080281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.375, "completions/mean_terminated_length": 5.833333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3729308247566223, "epoch": 0.01197, "frac_reward_zero_std": 0.0, "grad_norm": 0.04928111657500267, "kl": 0.3815515413880348, "learning_rate": 7.999500959918865e-06, "loss": -0.082, "num_tokens": 32156213.0, "reward": 0.6734275817871094, "reward_std": 1.3078868389129639, "rewards/rollout_reward_func/mean": 0.6734275817871094, "rewards/rollout_reward_func/std": 1.3078868389129639, "sampling/importance_sampling_ratio/max": 1.2633055448532104, "sampling/importance_sampling_ratio/mean": 0.6171059608459473, "sampling/importance_sampling_ratio/min": 1.357982313265893e-07, "sampling/sampling_logp_difference/max": 2.1826701164245605, "sampling/sampling_logp_difference/mean": 0.40713992714881897, "step": 1197, "step_time": 34.657169094949495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3723246082663536, "epoch": 0.01198, "grad_norm": 0.05456409975886345, "kl": 0.3690104056149721, "learning_rate": 7.999500099899686e-06, "loss": -0.0822, "step": 1198, "step_time": 16.18389147997368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.6875, "completions/mean_terminated_length": 5.3684210777282715, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.886243313550949, "epoch": 0.01199, "frac_reward_zero_std": 0.0, "grad_norm": 0.09124673157930374, "kl": 0.14624141482636333, "learning_rate": 7.999499239140151e-06, "loss": -0.0965, "num_tokens": 32210843.0, "reward": 0.2504453659057617, "reward_std": 1.3269269466400146, "rewards/rollout_reward_func/mean": 0.2504453659057617, "rewards/rollout_reward_func/std": 1.3269269466400146, "sampling/importance_sampling_ratio/max": 1.1838622093200684, "sampling/importance_sampling_ratio/mean": 0.5181049108505249, "sampling/importance_sampling_ratio/min": 1.4217552468664962e-08, "sampling/sampling_logp_difference/max": 2.174865484237671, "sampling/sampling_logp_difference/mean": 0.47894543409347534, "step": 1199, "step_time": 31.45351388803101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.885984107851982, "epoch": 0.012, "grad_norm": 0.09249264001846313, "kl": 0.14428062201477587, "learning_rate": 7.99949837764026e-06, "loss": -0.0966, "step": 1200, "step_time": 13.856396877032239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 8.15625, "completions/mean_terminated_length": 4.047619342803955, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0792349725961685, "epoch": 0.01201, "frac_reward_zero_std": 0.25, "grad_norm": 0.04887494072318077, "kl": 0.16294992435723543, "learning_rate": 7.999497515400014e-06, "loss": -0.0692, "num_tokens": 32272831.0, "reward": 0.8122738003730774, "reward_std": 1.344281792640686, "rewards/rollout_reward_func/mean": 0.8122738003730774, "rewards/rollout_reward_func/std": 1.344281792640686, "sampling/importance_sampling_ratio/max": 1.2075833082199097, "sampling/importance_sampling_ratio/mean": 0.6409121751785278, "sampling/importance_sampling_ratio/min": 1.4072996101788249e-08, "sampling/sampling_logp_difference/max": 2.2288553714752197, "sampling/sampling_logp_difference/mean": 0.3865017294883728, "step": 1201, "step_time": 33.41948744605179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0811116956174374, "epoch": 0.01202, "grad_norm": 0.048445045948028564, "kl": 0.16328430827707052, "learning_rate": 7.99949665241941e-06, "loss": -0.0692, "step": 1202, "step_time": 13.996280033054063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.625, "completions/mean_terminated_length": 5.73913049697876, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5719758607447147, "epoch": 0.01203, "frac_reward_zero_std": 0.25, "grad_norm": 0.1557425707578659, "kl": 0.5147290844470263, "learning_rate": 7.999495788698454e-06, "loss": -0.0476, "num_tokens": 32330630.0, "reward": 0.4493391513824463, "reward_std": 1.347664475440979, "rewards/rollout_reward_func/mean": 0.4493391513824463, "rewards/rollout_reward_func/std": 1.347664475440979, "sampling/importance_sampling_ratio/max": 1.725486397743225, "sampling/importance_sampling_ratio/mean": 0.6537848711013794, "sampling/importance_sampling_ratio/min": 8.710744623385835e-06, "sampling/sampling_logp_difference/max": 1.9137824773788452, "sampling/sampling_logp_difference/mean": 0.3016033172607422, "step": 1203, "step_time": 34.409893277974334 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 1.5733028948307037, "epoch": 0.01204, "grad_norm": 0.12364979833364487, "kl": 0.42481965757906437, "learning_rate": 7.999494924237141e-06, "loss": -0.0486, "step": 1204, "step_time": 15.19851209007902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.28125, "completions/mean_terminated_length": 5.26086950302124, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8728654524311423, "epoch": 0.01205, "frac_reward_zero_std": 0.0, "grad_norm": 0.14985540509223938, "kl": 0.7384305889718235, "learning_rate": 7.999494059035475e-06, "loss": -0.0699, "num_tokens": 32396028.0, "reward": 0.43504831194877625, "reward_std": 1.2637900114059448, "rewards/rollout_reward_func/mean": 0.43504831194877625, "rewards/rollout_reward_func/std": 1.2637900114059448, "sampling/importance_sampling_ratio/max": 1.210408329963684, "sampling/importance_sampling_ratio/mean": 0.6313702464103699, "sampling/importance_sampling_ratio/min": 2.757780066531268e-06, "sampling/sampling_logp_difference/max": 1.8579983711242676, "sampling/sampling_logp_difference/mean": 0.3199405074119568, "step": 1205, "step_time": 31.680071300972486 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "entropy": 1.8767142230644822, "epoch": 0.01206, "grad_norm": 0.14363840222358704, "kl": 0.6733369543217123, "learning_rate": 7.999493193093452e-06, "loss": -0.0704, "step": 1206, "step_time": 13.84138046199223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.125, "completions/mean_terminated_length": 4.4210524559021, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.167174495756626, "epoch": 0.01207, "frac_reward_zero_std": 0.0, "grad_norm": 0.0479525588452816, "kl": 0.18078247271478176, "learning_rate": 7.999492326411075e-06, "loss": -0.0859, "num_tokens": 32454692.0, "reward": -0.30245232582092285, "reward_std": 0.994107186794281, "rewards/rollout_reward_func/mean": -0.30245232582092285, "rewards/rollout_reward_func/std": 0.994107186794281, "sampling/importance_sampling_ratio/max": 1.2236194610595703, "sampling/importance_sampling_ratio/mean": 0.5556106567382812, "sampling/importance_sampling_ratio/min": 1.1715553227986675e-05, "sampling/sampling_logp_difference/max": 1.9490158557891846, "sampling/sampling_logp_difference/mean": 0.35287386178970337, "step": 1207, "step_time": 33.09875981602818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1793458946049213, "epoch": 0.01208, "grad_norm": 0.04941130802035332, "kl": 0.17814619559794664, "learning_rate": 7.999491458988344e-06, "loss": -0.0857, "step": 1208, "step_time": 14.429124043002957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.8125, "completions/mean_terminated_length": 5.047619342803955, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.98946213722229, "epoch": 0.01209, "frac_reward_zero_std": 0.25, "grad_norm": 0.11375176161527634, "kl": 0.16818036697804928, "learning_rate": 7.999490590825262e-06, "loss": -0.0637, "num_tokens": 32519702.0, "reward": 0.23629291355609894, "reward_std": 1.2340302467346191, "rewards/rollout_reward_func/mean": 0.23629291355609894, "rewards/rollout_reward_func/std": 1.2340302467346191, "sampling/importance_sampling_ratio/max": 1.3254725933074951, "sampling/importance_sampling_ratio/mean": 0.6311454772949219, "sampling/importance_sampling_ratio/min": 7.413883577100933e-05, "sampling/sampling_logp_difference/max": 1.6930506229400635, "sampling/sampling_logp_difference/mean": 0.3100363314151764, "step": 1209, "step_time": 38.16371852494194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9920442998409271, "epoch": 0.0121, "grad_norm": 0.11380697786808014, "kl": 0.1647121999412775, "learning_rate": 7.999489721921822e-06, "loss": -0.0633, "step": 1210, "step_time": 15.932840385998134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.78125, "completions/mean_terminated_length": 4.5652174949646, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8106500059366226, "epoch": 0.01211, "frac_reward_zero_std": 0.0, "grad_norm": 0.08086801320314407, "kl": 0.36686061415821314, "learning_rate": 7.999488852278032e-06, "loss": -0.0966, "num_tokens": 32576477.0, "reward": 0.7657896280288696, "reward_std": 1.3312981128692627, "rewards/rollout_reward_func/mean": 0.7657896280288696, "rewards/rollout_reward_func/std": 1.3312979936599731, "sampling/importance_sampling_ratio/max": 1.3729406595230103, "sampling/importance_sampling_ratio/mean": 0.6346157193183899, "sampling/importance_sampling_ratio/min": 2.2095984604675323e-05, "sampling/sampling_logp_difference/max": 1.9243078231811523, "sampling/sampling_logp_difference/mean": 0.3137664794921875, "step": 1211, "step_time": 33.25817417097278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.8125076293945312, "epoch": 0.01212, "grad_norm": 0.06509093940258026, "kl": 0.36076891142874956, "learning_rate": 7.999487981893887e-06, "loss": -0.0966, "step": 1212, "step_time": 14.214387557032751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.59375, "completions/mean_terminated_length": 5.227272987365723, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.821017749607563, "epoch": 0.01213, "frac_reward_zero_std": 0.0, "grad_norm": 0.08729856461286545, "kl": 0.20271714450791478, "learning_rate": 7.999487110769388e-06, "loss": -0.0896, "num_tokens": 32632378.0, "reward": 0.6502155065536499, "reward_std": 1.3558915853500366, "rewards/rollout_reward_func/mean": 0.6502155065536499, "rewards/rollout_reward_func/std": 1.355891466140747, "sampling/importance_sampling_ratio/max": 1.2597694396972656, "sampling/importance_sampling_ratio/mean": 0.5736058950424194, "sampling/importance_sampling_ratio/min": 1.7773916624719277e-05, "sampling/sampling_logp_difference/max": 1.9698375463485718, "sampling/sampling_logp_difference/mean": 0.30470287799835205, "step": 1213, "step_time": 33.10853423696244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8225355371832848, "epoch": 0.01214, "grad_norm": 0.0796605795621872, "kl": 0.20921060326509178, "learning_rate": 7.999486238904537e-06, "loss": -0.09, "step": 1214, "step_time": 15.48401160503272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.84375, "completions/mean_terminated_length": 5.590909481048584, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.234466675668955, "epoch": 0.01215, "frac_reward_zero_std": 0.0, "grad_norm": 0.0512901246547699, "kl": 0.14712389092892408, "learning_rate": 7.999485366299335e-06, "loss": -0.0633, "num_tokens": 32687317.0, "reward": 0.5737326145172119, "reward_std": 1.315326452255249, "rewards/rollout_reward_func/mean": 0.5737326145172119, "rewards/rollout_reward_func/std": 1.315326452255249, "sampling/importance_sampling_ratio/max": 1.1975092887878418, "sampling/importance_sampling_ratio/mean": 0.5758040547370911, "sampling/importance_sampling_ratio/min": 1.937999286383274e-06, "sampling/sampling_logp_difference/max": 1.9979102611541748, "sampling/sampling_logp_difference/mean": 0.3753010630607605, "step": 1215, "step_time": 32.18501347498386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.235367923974991, "epoch": 0.01216, "grad_norm": 0.047409992665052414, "kl": 0.1496683293953538, "learning_rate": 7.999484492953778e-06, "loss": -0.0634, "step": 1216, "step_time": 13.809441204037284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.6875, "completions/mean_terminated_length": 4.857142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2238616198301315, "epoch": 0.01217, "frac_reward_zero_std": 0.25, "grad_norm": 0.23456493020057678, "kl": 0.32834220584481955, "learning_rate": 7.99948361886787e-06, "loss": -0.0414, "num_tokens": 32747007.0, "reward": 0.22739286720752716, "reward_std": 1.2636678218841553, "rewards/rollout_reward_func/mean": 0.22739286720752716, "rewards/rollout_reward_func/std": 1.2636678218841553, "sampling/importance_sampling_ratio/max": 1.2405284643173218, "sampling/importance_sampling_ratio/mean": 0.5319148898124695, "sampling/importance_sampling_ratio/min": 1.6767029364928021e-06, "sampling/sampling_logp_difference/max": 1.7450580596923828, "sampling/sampling_logp_difference/mean": 0.3652515113353729, "step": 1217, "step_time": 36.00242711798637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 2.2311063706874847, "epoch": 0.01218, "grad_norm": 0.12009921669960022, "kl": 0.3318394022062421, "learning_rate": 7.99948274404161e-06, "loss": -0.042, "step": 1218, "step_time": 16.350801081978716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 9.4375, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4026447646319866, "epoch": 0.01219, "frac_reward_zero_std": 0.0, "grad_norm": 0.03837737813591957, "kl": 0.40194797213189304, "learning_rate": 7.999481868474998e-06, "loss": -0.0948, "num_tokens": 32812676.0, "reward": 0.3397909998893738, "reward_std": 1.1733063459396362, "rewards/rollout_reward_func/mean": 0.3397909998893738, "rewards/rollout_reward_func/std": 1.1733063459396362, "sampling/importance_sampling_ratio/max": 1.4740691184997559, "sampling/importance_sampling_ratio/mean": 0.5324823260307312, "sampling/importance_sampling_ratio/min": 2.0865827121685498e-10, "sampling/sampling_logp_difference/max": 2.8328068256378174, "sampling/sampling_logp_difference/mean": 0.48776745796203613, "step": 1219, "step_time": 34.56223583596875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.405575707554817, "epoch": 0.0122, "grad_norm": 0.03805918246507645, "kl": 0.3906633607111871, "learning_rate": 7.999480992168033e-06, "loss": -0.0949, "step": 1220, "step_time": 14.150991645030444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.65625, "completions/mean_terminated_length": 5.849999904632568, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3084873287007213, "epoch": 0.01221, "frac_reward_zero_std": 0.0, "grad_norm": 0.07096515595912933, "kl": 0.1772341439500451, "learning_rate": 7.99948011512072e-06, "loss": -0.1013, "num_tokens": 32880362.0, "reward": 0.33413437008857727, "reward_std": 1.2659690380096436, "rewards/rollout_reward_func/mean": 0.33413437008857727, "rewards/rollout_reward_func/std": 1.2659690380096436, "sampling/importance_sampling_ratio/max": 1.3146201372146606, "sampling/importance_sampling_ratio/mean": 0.5416266918182373, "sampling/importance_sampling_ratio/min": 2.5422948510822607e-06, "sampling/sampling_logp_difference/max": 1.9455937147140503, "sampling/sampling_logp_difference/mean": 0.38639235496520996, "step": 1221, "step_time": 33.470527952042175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3122681798413396, "epoch": 0.01222, "grad_norm": 0.07620174437761307, "kl": 0.18052190146408975, "learning_rate": 7.999479237333053e-06, "loss": -0.1014, "step": 1222, "step_time": 14.80338156898506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.59375, "completions/mean_terminated_length": 5.695652484893799, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9611427336931229, "epoch": 0.01223, "frac_reward_zero_std": 0.0, "grad_norm": 0.03537490591406822, "kl": 0.29416032414883375, "learning_rate": 7.999478358805037e-06, "loss": -0.0856, "num_tokens": 32942843.0, "reward": 0.640546977519989, "reward_std": 1.2701762914657593, "rewards/rollout_reward_func/mean": 0.640546977519989, "rewards/rollout_reward_func/std": 1.2701764106750488, "sampling/importance_sampling_ratio/max": 1.4271756410598755, "sampling/importance_sampling_ratio/mean": 0.6136554479598999, "sampling/importance_sampling_ratio/min": 9.933282854035497e-07, "sampling/sampling_logp_difference/max": 2.0833115577697754, "sampling/sampling_logp_difference/mean": 0.32224613428115845, "step": 1223, "step_time": 32.34795314804069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9615449905395508, "epoch": 0.01224, "grad_norm": 0.03638250380754471, "kl": 0.2980034532956779, "learning_rate": 7.999477479536669e-06, "loss": -0.0856, "step": 1224, "step_time": 14.071079666988226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.625, "completions/mean_terminated_length": 5.882352828979492, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.856046184897423, "epoch": 0.01225, "frac_reward_zero_std": 0.0, "grad_norm": 0.06413161754608154, "kl": 0.13873926317319274, "learning_rate": 7.99947659952795e-06, "loss": -0.0838, "num_tokens": 33009408.0, "reward": -0.38465049862861633, "reward_std": 0.9232367873191833, "rewards/rollout_reward_func/mean": -0.38465049862861633, "rewards/rollout_reward_func/std": 0.9232367873191833, "sampling/importance_sampling_ratio/max": 1.1760292053222656, "sampling/importance_sampling_ratio/mean": 0.3783479630947113, "sampling/importance_sampling_ratio/min": 1.6041936760302633e-05, "sampling/sampling_logp_difference/max": 1.9846919775009155, "sampling/sampling_logp_difference/mean": 0.4018378257751465, "step": 1225, "step_time": 36.56946353899548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.853304907679558, "epoch": 0.01226, "grad_norm": 0.06482398509979248, "kl": 0.13868837617337704, "learning_rate": 7.999475718778881e-06, "loss": -0.0839, "step": 1226, "step_time": 14.30014424098772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 5.217391490936279, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9896733816713095, "epoch": 0.01227, "frac_reward_zero_std": 0.25, "grad_norm": 0.1791374832391739, "kl": 0.37515155505388975, "learning_rate": 7.999474837289462e-06, "loss": -0.0618, "num_tokens": 33068473.0, "reward": 0.4732644855976105, "reward_std": 1.1018158197402954, "rewards/rollout_reward_func/mean": 0.4732644855976105, "rewards/rollout_reward_func/std": 1.1018158197402954, "sampling/importance_sampling_ratio/max": 1.2010376453399658, "sampling/importance_sampling_ratio/mean": 0.6349478960037231, "sampling/importance_sampling_ratio/min": 3.261825668232632e-06, "sampling/sampling_logp_difference/max": 1.8997381925582886, "sampling/sampling_logp_difference/mean": 0.33265408873558044, "step": 1227, "step_time": 34.90071045298828 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.977799342945218, "epoch": 0.01228, "grad_norm": 0.09394548833370209, "kl": 0.37368569895625114, "learning_rate": 7.999473955059694e-06, "loss": -0.0623, "step": 1228, "step_time": 15.437030582892476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.46875, "completions/mean_terminated_length": 5.5217390060424805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2669097781181335, "epoch": 0.01229, "frac_reward_zero_std": 0.0, "grad_norm": 0.10281732678413391, "kl": 0.2190845343284309, "learning_rate": 7.999473072089576e-06, "loss": -0.062, "num_tokens": 33131798.0, "reward": 0.05616968870162964, "reward_std": 1.2603247165679932, "rewards/rollout_reward_func/mean": 0.05616968870162964, "rewards/rollout_reward_func/std": 1.2603245973587036, "sampling/importance_sampling_ratio/max": 1.2517259120941162, "sampling/importance_sampling_ratio/mean": 0.5409477949142456, "sampling/importance_sampling_ratio/min": 1.4539148196490714e-06, "sampling/sampling_logp_difference/max": 1.852141261100769, "sampling/sampling_logp_difference/mean": 0.34634676575660706, "step": 1229, "step_time": 38.93637799593853 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 2.26152740418911, "epoch": 0.0123, "grad_norm": 0.11985331028699875, "kl": 0.21955532673746347, "learning_rate": 7.999472188379108e-06, "loss": -0.0622, "step": 1230, "step_time": 17.8524733200029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.1018419349566102, "epoch": 0.01231, "frac_reward_zero_std": 1.0, "grad_norm": 0.00040371593786403537, "kl": 0.25542539544403553, "learning_rate": 7.999471303928292e-06, "loss": 0.0006, "num_tokens": 33166860.0, "reward": 1.1138956546783447, "reward_std": 1.1120424270629883, "rewards/rollout_reward_func/mean": 1.1138956546783447, "rewards/rollout_reward_func/std": 1.1120424270629883, "sampling/importance_sampling_ratio/max": 1.0574641227722168, "sampling/importance_sampling_ratio/mean": 1.0218840837478638, "sampling/importance_sampling_ratio/min": 1.0052605867385864, "sampling/sampling_logp_difference/max": 0.037688374519348145, "sampling/sampling_logp_difference/mean": 0.005498196929693222, "step": 1231, "step_time": 11.956343154015485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09812386333942413, "epoch": 0.01232, "grad_norm": 0.0003884999896399677, "kl": 0.2558121755719185, "learning_rate": 7.999470418737126e-06, "loss": 0.0006, "step": 1232, "step_time": 6.830436139949597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.15625, "completions/mean_terminated_length": 5.5714287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4526395201683044, "epoch": 0.01233, "frac_reward_zero_std": 0.0, "grad_norm": 0.11972252279520035, "kl": 0.22758384980261326, "learning_rate": 7.999469532805612e-06, "loss": -0.0831, "num_tokens": 33230716.0, "reward": -0.01691402494907379, "reward_std": 1.1575590372085571, "rewards/rollout_reward_func/mean": -0.01691402494907379, "rewards/rollout_reward_func/std": 1.1575590372085571, "sampling/importance_sampling_ratio/max": 1.0832531452178955, "sampling/importance_sampling_ratio/mean": 0.49175333976745605, "sampling/importance_sampling_ratio/min": 5.155258804734331e-06, "sampling/sampling_logp_difference/max": 2.2200093269348145, "sampling/sampling_logp_difference/mean": 0.39153483510017395, "step": 1233, "step_time": 39.46203132602386 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 2.4511147290468216, "epoch": 0.01234, "grad_norm": 0.08092327415943146, "kl": 0.22174823842942715, "learning_rate": 7.999468646133747e-06, "loss": -0.0834, "step": 1234, "step_time": 16.387977045058506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.03125, "completions/mean_terminated_length": 4.607142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2275980226695538, "epoch": 0.01235, "frac_reward_zero_std": 0.25, "grad_norm": 0.11439923942089081, "kl": 0.36016431264579296, "learning_rate": 7.999467758721536e-06, "loss": -0.0318, "num_tokens": 33273438.0, "reward": 0.7974803447723389, "reward_std": 1.2504276037216187, "rewards/rollout_reward_func/mean": 0.7974803447723389, "rewards/rollout_reward_func/std": 1.2504276037216187, "sampling/importance_sampling_ratio/max": 1.229146957397461, "sampling/importance_sampling_ratio/mean": 0.7875103950500488, "sampling/importance_sampling_ratio/min": 6.796704838052392e-06, "sampling/sampling_logp_difference/max": 2.1318769454956055, "sampling/sampling_logp_difference/mean": 0.24650396406650543, "step": 1235, "step_time": 24.129693632014096 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.2262222217395902, "epoch": 0.01236, "grad_norm": 0.08738066256046295, "kl": 0.3606251012533903, "learning_rate": 7.999466870568978e-06, "loss": -0.0317, "step": 1236, "step_time": 11.423037494008895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 4.533333778381348, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7218894297257066, "epoch": 0.01237, "frac_reward_zero_std": 0.5, "grad_norm": 0.01986253634095192, "kl": 0.2716878168284893, "learning_rate": 7.999465981676069e-06, "loss": -0.023, "num_tokens": 33313616.0, "reward": 1.34153151512146, "reward_std": 1.07893967628479, "rewards/rollout_reward_func/mean": 1.34153151512146, "rewards/rollout_reward_func/std": 1.07893967628479, "sampling/importance_sampling_ratio/max": 1.3027549982070923, "sampling/importance_sampling_ratio/mean": 0.9070804119110107, "sampling/importance_sampling_ratio/min": 0.0001373698323732242, "sampling/sampling_logp_difference/max": 2.159423351287842, "sampling/sampling_logp_difference/mean": 0.1546270251274109, "step": 1237, "step_time": 24.59384482903988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7222760869190097, "epoch": 0.01238, "grad_norm": 0.020393459126353264, "kl": 0.27176718041300774, "learning_rate": 7.999465092042814e-06, "loss": -0.023, "step": 1238, "step_time": 12.736114182975143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.625, "completions/mean_terminated_length": 4.551723957061768, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3502742424607277, "epoch": 0.01239, "frac_reward_zero_std": 0.0, "grad_norm": 0.05591744929552078, "kl": 0.9742694143205881, "learning_rate": 7.999464201669212e-06, "loss": -0.0831, "num_tokens": 33355714.0, "reward": 1.442880630493164, "reward_std": 1.0238513946533203, "rewards/rollout_reward_func/mean": 1.442880630493164, "rewards/rollout_reward_func/std": 1.0238512754440308, "sampling/importance_sampling_ratio/max": 1.1297088861465454, "sampling/importance_sampling_ratio/mean": 0.7561454772949219, "sampling/importance_sampling_ratio/min": 4.773378581290899e-09, "sampling/sampling_logp_difference/max": 2.2115237712860107, "sampling/sampling_logp_difference/mean": 0.2680140733718872, "step": 1239, "step_time": 25.47199128998909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3558268938213587, "epoch": 0.0124, "grad_norm": 0.05511412397027016, "kl": 0.9559499248862267, "learning_rate": 7.999463310555263e-06, "loss": -0.0831, "step": 1240, "step_time": 13.129424227023264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.8125, "completions/mean_terminated_length": 7.153846263885498, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.201410859823227, "epoch": 0.01241, "frac_reward_zero_std": 0.25, "grad_norm": 0.06570523977279663, "kl": 0.17805433366447687, "learning_rate": 7.999462418700966e-06, "loss": -0.0556, "num_tokens": 33411287.0, "reward": 0.35216787457466125, "reward_std": 1.4111248254776, "rewards/rollout_reward_func/mean": 0.35216787457466125, "rewards/rollout_reward_func/std": 1.4111248254776, "sampling/importance_sampling_ratio/max": 1.037960171699524, "sampling/importance_sampling_ratio/mean": 0.5038840770721436, "sampling/importance_sampling_ratio/min": 9.728748409543186e-06, "sampling/sampling_logp_difference/max": 1.9440557956695557, "sampling/sampling_logp_difference/mean": 0.3267875015735626, "step": 1241, "step_time": 31.54049046401633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2007770389318466, "epoch": 0.01242, "grad_norm": 0.06043918803334236, "kl": 0.17749504651874304, "learning_rate": 7.999461526106323e-06, "loss": -0.0557, "step": 1242, "step_time": 14.181351169972913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 10.34375, "completions/mean_terminated_length": 4.6875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.831754833459854, "epoch": 0.01243, "frac_reward_zero_std": 0.0, "grad_norm": 0.06187840923666954, "kl": 0.23985432088375092, "learning_rate": 7.999460632771332e-06, "loss": -0.0983, "num_tokens": 33478041.0, "reward": 0.42211243510246277, "reward_std": 1.2242136001586914, "rewards/rollout_reward_func/mean": 0.42211243510246277, "rewards/rollout_reward_func/std": 1.2242133617401123, "sampling/importance_sampling_ratio/max": 1.074759840965271, "sampling/importance_sampling_ratio/mean": 0.40887951850891113, "sampling/importance_sampling_ratio/min": 2.4470750759064686e-06, "sampling/sampling_logp_difference/max": 1.840915560722351, "sampling/sampling_logp_difference/mean": 0.4305936098098755, "step": 1243, "step_time": 35.44425478603807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.837806284427643, "epoch": 0.01244, "grad_norm": 0.06364990770816803, "kl": 0.2224881835281849, "learning_rate": 7.999459738695996e-06, "loss": -0.0986, "step": 1244, "step_time": 13.889672229997814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.59375, "completions/mean_terminated_length": 5.239999771118164, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8794025429524481, "epoch": 0.01245, "frac_reward_zero_std": 0.0, "grad_norm": 0.12310747802257538, "kl": 0.38843640219420195, "learning_rate": 7.999458843880313e-06, "loss": -0.0632, "num_tokens": 33537703.0, "reward": -0.1696900576353073, "reward_std": 1.1733986139297485, "rewards/rollout_reward_func/mean": -0.1696900576353073, "rewards/rollout_reward_func/std": 1.1733986139297485, "sampling/importance_sampling_ratio/max": 1.2302647829055786, "sampling/importance_sampling_ratio/mean": 0.6309562921524048, "sampling/importance_sampling_ratio/min": 4.7270245886466e-06, "sampling/sampling_logp_difference/max": 2.11287784576416, "sampling/sampling_logp_difference/mean": 0.3389694094657898, "step": 1245, "step_time": 32.24719790706877 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 1.8759087910875678, "epoch": 0.01246, "grad_norm": 0.11428000032901764, "kl": 0.386735406704247, "learning_rate": 7.999457948324285e-06, "loss": -0.0637, "step": 1246, "step_time": 14.49441214298713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.84375, "completions/mean_terminated_length": 4.392857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.193024542182684, "epoch": 0.01247, "frac_reward_zero_std": 0.25, "grad_norm": 0.0973619669675827, "kl": 0.5262286178767681, "learning_rate": 7.99945705202791e-06, "loss": -0.0474, "num_tokens": 33588738.0, "reward": 0.4341088533401489, "reward_std": 1.3146191835403442, "rewards/rollout_reward_func/mean": 0.4341088533401489, "rewards/rollout_reward_func/std": 1.3146191835403442, "sampling/importance_sampling_ratio/max": 1.357807993888855, "sampling/importance_sampling_ratio/mean": 0.7984275221824646, "sampling/importance_sampling_ratio/min": 6.051070158719085e-05, "sampling/sampling_logp_difference/max": 1.8334101438522339, "sampling/sampling_logp_difference/mean": 0.2297278493642807, "step": 1247, "step_time": 28.40407216004678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.199691716581583, "epoch": 0.01248, "grad_norm": 0.09756150096654892, "kl": 0.5286064967513084, "learning_rate": 7.999456154991191e-06, "loss": -0.0474, "step": 1248, "step_time": 14.87778032006463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.6875, "completions/mean_terminated_length": 4.6666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.683401547372341, "epoch": 0.01249, "frac_reward_zero_std": 0.0, "grad_norm": 0.27465301752090454, "kl": 0.27011545072309673, "learning_rate": 7.999455257214127e-06, "loss": -0.0875, "num_tokens": 33654906.0, "reward": -0.1782599538564682, "reward_std": 1.0542043447494507, "rewards/rollout_reward_func/mean": -0.1782599538564682, "rewards/rollout_reward_func/std": 1.0542043447494507, "sampling/importance_sampling_ratio/max": 1.2685073614120483, "sampling/importance_sampling_ratio/mean": 0.3916807770729065, "sampling/importance_sampling_ratio/min": 1.0584684240200204e-09, "sampling/sampling_logp_difference/max": 2.2808637619018555, "sampling/sampling_logp_difference/mean": 0.46658051013946533, "step": 1249, "step_time": 33.91040635597892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.6709285639226437, "epoch": 0.0125, "grad_norm": 0.2184899002313614, "kl": 0.2624382085632533, "learning_rate": 7.999454358696716e-06, "loss": -0.0889, "step": 1250, "step_time": 13.828499347990146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 5.428571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6823127940297127, "epoch": 0.01251, "frac_reward_zero_std": 0.5, "grad_norm": 0.035286419093608856, "kl": 0.22679029777646065, "learning_rate": 7.999453459438961e-06, "loss": -0.0447, "num_tokens": 33704035.0, "reward": 1.2148637771606445, "reward_std": 1.2402888536453247, "rewards/rollout_reward_func/mean": 1.2148637771606445, "rewards/rollout_reward_func/std": 1.2402888536453247, "sampling/importance_sampling_ratio/max": 1.0539436340332031, "sampling/importance_sampling_ratio/mean": 0.7443689107894897, "sampling/importance_sampling_ratio/min": 1.8706614355323836e-06, "sampling/sampling_logp_difference/max": 1.981218695640564, "sampling/sampling_logp_difference/mean": 0.2866968810558319, "step": 1251, "step_time": 26.813795748981647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6795699205249548, "epoch": 0.01252, "grad_norm": 0.03593314811587334, "kl": 0.22609956376254559, "learning_rate": 7.999452559440863e-06, "loss": -0.0447, "step": 1252, "step_time": 13.407854192977538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 5.6666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1949944607913494, "epoch": 0.01253, "frac_reward_zero_std": 0.0, "grad_norm": 0.06054797023534775, "kl": 0.8118072636425495, "learning_rate": 7.999451658702418e-06, "loss": -0.0849, "num_tokens": 33753394.0, "reward": 0.7111312747001648, "reward_std": 1.2867761850357056, "rewards/rollout_reward_func/mean": 0.7111312747001648, "rewards/rollout_reward_func/std": 1.2867761850357056, "sampling/importance_sampling_ratio/max": 1.115060567855835, "sampling/importance_sampling_ratio/mean": 0.5419345498085022, "sampling/importance_sampling_ratio/min": 5.238187554823526e-07, "sampling/sampling_logp_difference/max": 1.8855193853378296, "sampling/sampling_logp_difference/mean": 0.39169424772262573, "step": 1253, "step_time": 28.412965532043017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1925269663333893, "epoch": 0.01254, "grad_norm": 0.06046074628829956, "kl": 0.8080395152792335, "learning_rate": 7.999450757223633e-06, "loss": -0.085, "step": 1254, "step_time": 13.401990104990546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.65625, "completions/mean_terminated_length": 4.9259257316589355, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1711821057833731, "epoch": 0.01255, "frac_reward_zero_std": 0.0, "grad_norm": 0.04817916452884674, "kl": 0.3161987718194723, "learning_rate": 7.9994498550045e-06, "loss": -0.074, "num_tokens": 33815412.0, "reward": 0.8374262452125549, "reward_std": 1.2007629871368408, "rewards/rollout_reward_func/mean": 0.8374262452125549, "rewards/rollout_reward_func/std": 1.2007629871368408, "sampling/importance_sampling_ratio/max": 1.176859736442566, "sampling/importance_sampling_ratio/mean": 0.7541985511779785, "sampling/importance_sampling_ratio/min": 4.678602749663696e-07, "sampling/sampling_logp_difference/max": 2.0222668647766113, "sampling/sampling_logp_difference/mean": 0.3235209584236145, "step": 1255, "step_time": 31.839396447030595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 1.1729874298907816, "epoch": 0.01256, "grad_norm": 0.040058258920907974, "kl": 0.32549692317843437, "learning_rate": 7.999448952045025e-06, "loss": -0.0741, "step": 1256, "step_time": 15.132388004043605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.28125, "completions/mean_terminated_length": 4.481481552124023, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.161017849110067, "epoch": 0.01257, "frac_reward_zero_std": 0.25, "grad_norm": 0.0727061852812767, "kl": 0.28656249679625034, "learning_rate": 7.999448048345206e-06, "loss": -0.0531, "num_tokens": 33869217.0, "reward": 1.1246663331985474, "reward_std": 1.0616904497146606, "rewards/rollout_reward_func/mean": 1.1246663331985474, "rewards/rollout_reward_func/std": 1.0616904497146606, "sampling/importance_sampling_ratio/max": 1.1603736877441406, "sampling/importance_sampling_ratio/mean": 0.8163602352142334, "sampling/importance_sampling_ratio/min": 8.267756493296474e-05, "sampling/sampling_logp_difference/max": 1.8208339214324951, "sampling/sampling_logp_difference/mean": 0.18051843345165253, "step": 1257, "step_time": 33.01269569803844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1618927624076605, "epoch": 0.01258, "grad_norm": 0.08214057981967926, "kl": 0.28134836815297604, "learning_rate": 7.999447143905043e-06, "loss": -0.0533, "step": 1258, "step_time": 12.560381410061382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.375, "completions/mean_terminated_length": 6.526315689086914, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.875183880329132, "epoch": 0.01259, "frac_reward_zero_std": 0.0, "grad_norm": 0.02480260655283928, "kl": 0.13070201314985752, "learning_rate": 7.99944623872454e-06, "loss": -0.0757, "num_tokens": 33931119.0, "reward": -0.23984651267528534, "reward_std": 1.098369836807251, "rewards/rollout_reward_func/mean": -0.23984651267528534, "rewards/rollout_reward_func/std": 1.098369836807251, "sampling/importance_sampling_ratio/max": 1.265248417854309, "sampling/importance_sampling_ratio/mean": 0.41649097204208374, "sampling/importance_sampling_ratio/min": 2.8398046580946357e-08, "sampling/sampling_logp_difference/max": 2.1703312397003174, "sampling/sampling_logp_difference/mean": 0.4755210876464844, "step": 1259, "step_time": 31.592513729032362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 2.8808299154043198, "epoch": 0.0126, "grad_norm": 0.023076651617884636, "kl": 0.13658139249309897, "learning_rate": 7.999445332803692e-06, "loss": -0.0757, "step": 1260, "step_time": 13.657891903974814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8134731762111187, "epoch": 0.01261, "frac_reward_zero_std": 0.0, "grad_norm": 0.06027143821120262, "kl": 0.5045039877295494, "learning_rate": 7.999444426142501e-06, "loss": -0.086, "num_tokens": 33987204.0, "reward": 1.0198638439178467, "reward_std": 1.1717458963394165, "rewards/rollout_reward_func/mean": 1.0198638439178467, "rewards/rollout_reward_func/std": 1.171745777130127, "sampling/importance_sampling_ratio/max": 1.1473509073257446, "sampling/importance_sampling_ratio/mean": 0.7520831823348999, "sampling/importance_sampling_ratio/min": 6.177929279260752e-09, "sampling/sampling_logp_difference/max": 2.6424660682678223, "sampling/sampling_logp_difference/mean": 0.3971211016178131, "step": 1261, "step_time": 33.386750927980756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8149005342274904, "epoch": 0.01262, "grad_norm": 0.05876649543642998, "kl": 0.49827569257467985, "learning_rate": 7.999443518740967e-06, "loss": -0.086, "step": 1262, "step_time": 15.958516823040554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.1875, "completions/mean_terminated_length": 4.636363983154297, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.224584735929966, "epoch": 0.01263, "frac_reward_zero_std": 0.0, "grad_norm": 0.06453751027584076, "kl": 0.1882626749575138, "learning_rate": 7.999442610599094e-06, "loss": -0.0917, "num_tokens": 34037341.0, "reward": 0.9629844427108765, "reward_std": 1.2932151556015015, "rewards/rollout_reward_func/mean": 0.9629844427108765, "rewards/rollout_reward_func/std": 1.2932151556015015, "sampling/importance_sampling_ratio/max": 1.0551341772079468, "sampling/importance_sampling_ratio/mean": 0.6399469375610352, "sampling/importance_sampling_ratio/min": 6.384164407791104e-06, "sampling/sampling_logp_difference/max": 2.5296154022216797, "sampling/sampling_logp_difference/mean": 0.3663385510444641, "step": 1263, "step_time": 25.921399649058003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2285681776702404, "epoch": 0.01264, "grad_norm": 0.06429603695869446, "kl": 0.18930040067061782, "learning_rate": 7.999441701716877e-06, "loss": -0.0918, "step": 1264, "step_time": 11.758677794947289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3379397839307785, "epoch": 0.01265, "frac_reward_zero_std": 0.25, "grad_norm": 0.10910235345363617, "kl": 0.2610350251197815, "learning_rate": 7.999440792094319e-06, "loss": -0.0694, "num_tokens": 34085504.0, "reward": 1.0050463676452637, "reward_std": 1.2971872091293335, "rewards/rollout_reward_func/mean": 1.0050463676452637, "rewards/rollout_reward_func/std": 1.297187089920044, "sampling/importance_sampling_ratio/max": 1.4992347955703735, "sampling/importance_sampling_ratio/mean": 0.7239359617233276, "sampling/importance_sampling_ratio/min": 0.0002954558003693819, "sampling/sampling_logp_difference/max": 2.053687572479248, "sampling/sampling_logp_difference/mean": 0.24103961884975433, "step": 1265, "step_time": 29.24319838898373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3402664931491017, "epoch": 0.01266, "grad_norm": 0.10743174701929092, "kl": 0.26469955313950777, "learning_rate": 7.999439881731418e-06, "loss": -0.0695, "step": 1266, "step_time": 13.36706261706422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.3125, "completions/mean_terminated_length": 4.285714149475098, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9734644033014774, "epoch": 0.01267, "frac_reward_zero_std": 0.25, "grad_norm": 0.15728577971458435, "kl": 0.17729719262570143, "learning_rate": 7.999438970628177e-06, "loss": -0.0718, "num_tokens": 34144404.0, "reward": 0.4463287889957428, "reward_std": 1.2628055810928345, "rewards/rollout_reward_func/mean": 0.4463287889957428, "rewards/rollout_reward_func/std": 1.2628055810928345, "sampling/importance_sampling_ratio/max": 1.3676860332489014, "sampling/importance_sampling_ratio/mean": 0.6563071012496948, "sampling/importance_sampling_ratio/min": 9.69021948549198e-06, "sampling/sampling_logp_difference/max": 2.058753252029419, "sampling/sampling_logp_difference/mean": 0.3490176200866699, "step": 1267, "step_time": 37.880150072072865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0024038462433964014, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024038462433964014, "entropy": 1.9792483318597078, "epoch": 0.01268, "grad_norm": 0.1217421442270279, "kl": 0.18216516263782978, "learning_rate": 7.999438058784595e-06, "loss": -0.0729, "step": 1268, "step_time": 15.716881898028078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 5.217391490936279, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0770973535254598, "epoch": 0.01269, "frac_reward_zero_std": 0.25, "grad_norm": 0.04159112647175789, "kl": 0.23120921663939953, "learning_rate": 7.999437146200673e-06, "loss": -0.0626, "num_tokens": 34197930.0, "reward": 0.42789754271507263, "reward_std": 1.3473310470581055, "rewards/rollout_reward_func/mean": 0.42789754271507263, "rewards/rollout_reward_func/std": 1.3473310470581055, "sampling/importance_sampling_ratio/max": 1.1858787536621094, "sampling/importance_sampling_ratio/mean": 0.596916139125824, "sampling/importance_sampling_ratio/min": 1.940701466196515e-08, "sampling/sampling_logp_difference/max": 2.14711856842041, "sampling/sampling_logp_difference/mean": 0.41571563482284546, "step": 1269, "step_time": 29.857597532944055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0029761905316263437, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029761905316263437, "entropy": 2.0738588608801365, "epoch": 0.0127, "grad_norm": 0.03315673768520355, "kl": 0.24546297173947096, "learning_rate": 7.99943623287641e-06, "loss": -0.0626, "step": 1270, "step_time": 13.410930808953708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.53125, "completions/mean_terminated_length": 4.708333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6522359577938914, "epoch": 0.01271, "frac_reward_zero_std": 0.0, "grad_norm": 0.13242360949516296, "kl": 0.5855775363743305, "learning_rate": 7.999435318811804e-06, "loss": -0.0638, "num_tokens": 34252797.0, "reward": 0.32886752486228943, "reward_std": 1.1020779609680176, "rewards/rollout_reward_func/mean": 0.32886752486228943, "rewards/rollout_reward_func/std": 1.102077841758728, "sampling/importance_sampling_ratio/max": 1.1139792203903198, "sampling/importance_sampling_ratio/mean": 0.6818426251411438, "sampling/importance_sampling_ratio/min": 3.060844392166473e-05, "sampling/sampling_logp_difference/max": 1.800283670425415, "sampling/sampling_logp_difference/mean": 0.2642180621623993, "step": 1271, "step_time": 30.8895097009663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6549618593417108, "epoch": 0.01272, "grad_norm": 0.1157357320189476, "kl": 0.5180480554699898, "learning_rate": 7.999434404006862e-06, "loss": -0.0644, "step": 1272, "step_time": 13.558257471973775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.3125, "completions/mean_terminated_length": 5.300000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.8922039985191077, "epoch": 0.01273, "frac_reward_zero_std": 0.0, "grad_norm": 0.06746836751699448, "kl": 0.3735696552321315, "learning_rate": 7.999433488461578e-06, "loss": -0.0775, "num_tokens": 34312100.0, "reward": 0.08649793267250061, "reward_std": 1.2778316736221313, "rewards/rollout_reward_func/mean": 0.08649793267250061, "rewards/rollout_reward_func/std": 1.2778316736221313, "sampling/importance_sampling_ratio/max": 1.2459100484848022, "sampling/importance_sampling_ratio/mean": 0.4866621494293213, "sampling/importance_sampling_ratio/min": 2.170263613265888e-08, "sampling/sampling_logp_difference/max": 2.1806631088256836, "sampling/sampling_logp_difference/mean": 0.5251799821853638, "step": 1273, "step_time": 31.199814082036028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.898905123816803, "epoch": 0.01274, "grad_norm": 0.06667637079954147, "kl": 0.36988169234246016, "learning_rate": 7.999432572175954e-06, "loss": -0.0775, "step": 1274, "step_time": 13.79843510495266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.09375, "completions/mean_terminated_length": 4.599999904632568, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4157032519578934, "epoch": 0.01275, "frac_reward_zero_std": 0.25, "grad_norm": 0.11145591735839844, "kl": 0.266749638132751, "learning_rate": 7.999431655149993e-06, "loss": -0.0375, "num_tokens": 34371040.0, "reward": 0.20446108281612396, "reward_std": 1.1633648872375488, "rewards/rollout_reward_func/mean": 0.20446108281612396, "rewards/rollout_reward_func/std": 1.1633647680282593, "sampling/importance_sampling_ratio/max": 1.249152660369873, "sampling/importance_sampling_ratio/mean": 0.6988414525985718, "sampling/importance_sampling_ratio/min": 2.6590236302581616e-05, "sampling/sampling_logp_difference/max": 1.770878791809082, "sampling/sampling_logp_difference/mean": 0.25433480739593506, "step": 1275, "step_time": 34.17064778000349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4246928617358208, "epoch": 0.01276, "grad_norm": 0.11903252452611923, "kl": 0.2673807246610522, "learning_rate": 7.99943073738369e-06, "loss": -0.0377, "step": 1276, "step_time": 15.65734947900637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.4375, "completions/mean_terminated_length": 6.692307949066162, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.459398217499256, "epoch": 0.01277, "frac_reward_zero_std": 0.0, "grad_norm": 0.06394411623477936, "kl": 0.29170612432062626, "learning_rate": 7.99942981887705e-06, "loss": -0.0816, "num_tokens": 34424232.0, "reward": -0.31072747707366943, "reward_std": 1.077170968055725, "rewards/rollout_reward_func/mean": -0.31072747707366943, "rewards/rollout_reward_func/std": 1.077170968055725, "sampling/importance_sampling_ratio/max": 1.1281728744506836, "sampling/importance_sampling_ratio/mean": 0.5673007965087891, "sampling/importance_sampling_ratio/min": 3.7462014006450772e-06, "sampling/sampling_logp_difference/max": 2.064666271209717, "sampling/sampling_logp_difference/mean": 0.43377190828323364, "step": 1277, "step_time": 30.01232155202888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.477618372067809, "epoch": 0.01278, "grad_norm": 0.08751263469457626, "kl": 0.29148282017558813, "learning_rate": 7.99942889963007e-06, "loss": -0.0816, "step": 1278, "step_time": 13.47002629499184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.59375, "completions/mean_terminated_length": 6.519999980926514, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.991370350122452, "epoch": 0.01279, "frac_reward_zero_std": 0.0, "grad_norm": 0.15746371448040009, "kl": 0.1832829685881734, "learning_rate": 7.99942797964275e-06, "loss": -0.072, "num_tokens": 34472502.0, "reward": 0.33168116211891174, "reward_std": 1.3736768960952759, "rewards/rollout_reward_func/mean": 0.33168116211891174, "rewards/rollout_reward_func/std": 1.3736767768859863, "sampling/importance_sampling_ratio/max": 1.0504050254821777, "sampling/importance_sampling_ratio/mean": 0.4480602741241455, "sampling/importance_sampling_ratio/min": 1.831838716270795e-07, "sampling/sampling_logp_difference/max": 2.245283603668213, "sampling/sampling_logp_difference/mean": 0.5184056758880615, "step": 1279, "step_time": 26.009934435016476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.0064519941806793, "epoch": 0.0128, "grad_norm": 0.1436946839094162, "kl": 0.18834844324737787, "learning_rate": 7.999427058915096e-06, "loss": -0.072, "step": 1280, "step_time": 12.463210244954098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.28125, "completions/mean_terminated_length": 4.839999675750732, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0004004426300526, "epoch": 0.01281, "frac_reward_zero_std": 0.0, "grad_norm": 0.12039213627576828, "kl": 0.22058424353599548, "learning_rate": 7.9994261374471e-06, "loss": -0.0871, "num_tokens": 34528287.0, "reward": 0.7432082891464233, "reward_std": 1.288697600364685, "rewards/rollout_reward_func/mean": 0.7432082891464233, "rewards/rollout_reward_func/std": 1.2886974811553955, "sampling/importance_sampling_ratio/max": 1.4487892389297485, "sampling/importance_sampling_ratio/mean": 0.7546964883804321, "sampling/importance_sampling_ratio/min": 7.818599101483414e-07, "sampling/sampling_logp_difference/max": 2.0722947120666504, "sampling/sampling_logp_difference/mean": 0.3892393410205841, "step": 1281, "step_time": 28.279727847053437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9997621541842818, "epoch": 0.01282, "grad_norm": 0.10358845442533493, "kl": 0.22181848995387554, "learning_rate": 7.99942521523877e-06, "loss": -0.0873, "step": 1282, "step_time": 14.154434655007208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.125, "completions/mean_terminated_length": 5.481481552124023, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8599033318459988, "epoch": 0.01283, "frac_reward_zero_std": 0.25, "grad_norm": 0.04865977168083191, "kl": 0.2696620039641857, "learning_rate": 7.999424292290098e-06, "loss": -0.0691, "num_tokens": 34571763.0, "reward": 0.7827470302581787, "reward_std": 1.3831607103347778, "rewards/rollout_reward_func/mean": 0.7827470302581787, "rewards/rollout_reward_func/std": 1.3831607103347778, "sampling/importance_sampling_ratio/max": 1.036712408065796, "sampling/importance_sampling_ratio/mean": 0.6895924806594849, "sampling/importance_sampling_ratio/min": 6.845878175454345e-08, "sampling/sampling_logp_difference/max": 2.1268749237060547, "sampling/sampling_logp_difference/mean": 0.34483271837234497, "step": 1283, "step_time": 28.458551561052445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.853410616517067, "epoch": 0.01284, "grad_norm": 0.03186609596014023, "kl": 0.26860002893954515, "learning_rate": 7.999423368601091e-06, "loss": -0.0693, "step": 1284, "step_time": 12.101553412067005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1691274237819016, "epoch": 0.01285, "frac_reward_zero_std": 0.0, "grad_norm": 0.027176646515727043, "kl": 0.30036937072873116, "learning_rate": 7.999422444171747e-06, "loss": -0.0458, "num_tokens": 34634624.0, "reward": 0.39467620849609375, "reward_std": 1.2622171640396118, "rewards/rollout_reward_func/mean": 0.39467620849609375, "rewards/rollout_reward_func/std": 1.2622170448303223, "sampling/importance_sampling_ratio/max": 1.098406434059143, "sampling/importance_sampling_ratio/mean": 0.804274320602417, "sampling/importance_sampling_ratio/min": 1.3156576414985466e-06, "sampling/sampling_logp_difference/max": 1.8343459367752075, "sampling/sampling_logp_difference/mean": 0.23551121354103088, "step": 1285, "step_time": 30.59893057597219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1645645620301366, "epoch": 0.01286, "grad_norm": 0.02244681678712368, "kl": 0.304609302431345, "learning_rate": 7.999421519002065e-06, "loss": -0.0458, "step": 1286, "step_time": 14.629417565010954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.28125, "completions/mean_terminated_length": 5.269230842590332, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7285978337749839, "epoch": 0.01287, "frac_reward_zero_std": 0.25, "grad_norm": 0.017430275678634644, "kl": 0.2703588306903839, "learning_rate": 7.999420593092048e-06, "loss": -0.0801, "num_tokens": 34689754.0, "reward": 0.9131816029548645, "reward_std": 1.2755860090255737, "rewards/rollout_reward_func/mean": 0.9131816029548645, "rewards/rollout_reward_func/std": 1.2755858898162842, "sampling/importance_sampling_ratio/max": 1.0215033292770386, "sampling/importance_sampling_ratio/mean": 0.6687299013137817, "sampling/importance_sampling_ratio/min": 1.560726923344191e-05, "sampling/sampling_logp_difference/max": 1.7993319034576416, "sampling/sampling_logp_difference/mean": 0.3051065504550934, "step": 1287, "step_time": 34.22017127100844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7243214277550578, "epoch": 0.01288, "grad_norm": 0.01659313216805458, "kl": 0.2680508140474558, "learning_rate": 7.999419666441693e-06, "loss": -0.0801, "step": 1288, "step_time": 15.295555882010376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.28125, "completions/mean_terminated_length": 5.269230842590332, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0879843086004257, "epoch": 0.01289, "frac_reward_zero_std": 0.0, "grad_norm": 0.09483874589204788, "kl": 0.3644894724711776, "learning_rate": 7.999418739051001e-06, "loss": -0.0836, "num_tokens": 34750530.0, "reward": 0.29206836223602295, "reward_std": 1.2877846956253052, "rewards/rollout_reward_func/mean": 0.29206836223602295, "rewards/rollout_reward_func/std": 1.2877846956253052, "sampling/importance_sampling_ratio/max": 1.1300725936889648, "sampling/importance_sampling_ratio/mean": 0.6388275027275085, "sampling/importance_sampling_ratio/min": 3.221190354452119e-06, "sampling/sampling_logp_difference/max": 1.9335885047912598, "sampling/sampling_logp_difference/mean": 0.37598341703414917, "step": 1289, "step_time": 33.05160501899081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.087123066186905, "epoch": 0.0129, "grad_norm": 0.09745527803897858, "kl": 0.353867094963789, "learning_rate": 7.999417810919975e-06, "loss": -0.0837, "step": 1290, "step_time": 14.292089325957932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 5.230769634246826, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8124463707208633, "epoch": 0.01291, "frac_reward_zero_std": 0.25, "grad_norm": 0.03713119029998779, "kl": 0.548107547685504, "learning_rate": 7.999416882048612e-06, "loss": -0.0543, "num_tokens": 34805335.0, "reward": -0.23147423565387726, "reward_std": 0.9037464261054993, "rewards/rollout_reward_func/mean": -0.23147423565387726, "rewards/rollout_reward_func/std": 0.9037463665008545, "sampling/importance_sampling_ratio/max": 1.0917093753814697, "sampling/importance_sampling_ratio/mean": 0.643724262714386, "sampling/importance_sampling_ratio/min": 0.00030880284612067044, "sampling/sampling_logp_difference/max": 2.015099287033081, "sampling/sampling_logp_difference/mean": 0.2941312789916992, "step": 1291, "step_time": 30.237457868031925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8039220422506332, "epoch": 0.01292, "grad_norm": 0.03745722398161888, "kl": 0.5455806292593479, "learning_rate": 7.999415952436915e-06, "loss": -0.0543, "step": 1292, "step_time": 13.152493539033458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.84375, "completions/mean_terminated_length": 5.148148059844971, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8816822841763496, "epoch": 0.01293, "frac_reward_zero_std": 0.0, "grad_norm": 0.04449780657887459, "kl": 0.5185564961284399, "learning_rate": 7.99941502208488e-06, "loss": -0.0807, "num_tokens": 34849706.0, "reward": -0.00010585784912109375, "reward_std": 1.255469799041748, "rewards/rollout_reward_func/mean": -0.00010585784912109375, "rewards/rollout_reward_func/std": 1.255469799041748, "sampling/importance_sampling_ratio/max": 1.106215476989746, "sampling/importance_sampling_ratio/mean": 0.6182364821434021, "sampling/importance_sampling_ratio/min": 1.191299588754191e-06, "sampling/sampling_logp_difference/max": 2.167250394821167, "sampling/sampling_logp_difference/mean": 0.3965071439743042, "step": 1293, "step_time": 23.399673032021383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8775014821439981, "epoch": 0.01294, "grad_norm": 0.04363902285695076, "kl": 0.49477208964526653, "learning_rate": 7.999414090992513e-06, "loss": -0.0807, "step": 1294, "step_time": 12.017904750013258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.34375, "completions/mean_terminated_length": 4.241379261016846, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9158597020432353, "epoch": 0.01295, "frac_reward_zero_std": 0.5, "grad_norm": 0.026479577645659447, "kl": 0.24212523736059666, "learning_rate": 7.999413159159809e-06, "loss": -0.0348, "num_tokens": 34899389.0, "reward": 0.6120079755783081, "reward_std": 1.3138338327407837, "rewards/rollout_reward_func/mean": 0.6120079755783081, "rewards/rollout_reward_func/std": 1.3138338327407837, "sampling/importance_sampling_ratio/max": 1.1445577144622803, "sampling/importance_sampling_ratio/mean": 0.8607210516929626, "sampling/importance_sampling_ratio/min": 6.478330192294379e-07, "sampling/sampling_logp_difference/max": 1.9200129508972168, "sampling/sampling_logp_difference/mean": 0.14883030951023102, "step": 1295, "step_time": 30.004147087951424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9134303014725447, "epoch": 0.01296, "grad_norm": 0.027130737900733948, "kl": 0.24162904173135757, "learning_rate": 7.999412226586771e-06, "loss": -0.0348, "step": 1296, "step_time": 14.159787527023582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.5625, "completions/mean_terminated_length": 4.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0382063034921885, "epoch": 0.01297, "frac_reward_zero_std": 0.0, "grad_norm": 0.02559228613972664, "kl": 0.2597943600267172, "learning_rate": 7.999411293273398e-06, "loss": -0.0727, "num_tokens": 34953400.0, "reward": 0.15633398294448853, "reward_std": 1.3052767515182495, "rewards/rollout_reward_func/mean": 0.15633398294448853, "rewards/rollout_reward_func/std": 1.30527663230896, "sampling/importance_sampling_ratio/max": 1.208052635192871, "sampling/importance_sampling_ratio/mean": 0.6847812533378601, "sampling/importance_sampling_ratio/min": 7.460257478442145e-08, "sampling/sampling_logp_difference/max": 2.145786762237549, "sampling/sampling_logp_difference/mean": 0.41213804483413696, "step": 1297, "step_time": 28.73014132096432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0412971768528223, "epoch": 0.01298, "grad_norm": 0.02485445886850357, "kl": 0.2594554778188467, "learning_rate": 7.999410359219692e-06, "loss": -0.0728, "step": 1298, "step_time": 13.513843054039171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.34375, "completions/mean_terminated_length": 4.458333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.950750470161438, "epoch": 0.01299, "frac_reward_zero_std": 0.25, "grad_norm": 0.05972818285226822, "kl": 0.598530350252986, "learning_rate": 7.999409424425651e-06, "loss": -0.0318, "num_tokens": 35004686.0, "reward": -0.3595668375492096, "reward_std": 0.9897708892822266, "rewards/rollout_reward_func/mean": -0.3595668375492096, "rewards/rollout_reward_func/std": 0.9897708892822266, "sampling/importance_sampling_ratio/max": 1.3477847576141357, "sampling/importance_sampling_ratio/mean": 0.6900944113731384, "sampling/importance_sampling_ratio/min": 3.9389556150126737e-07, "sampling/sampling_logp_difference/max": 2.8219146728515625, "sampling/sampling_logp_difference/mean": 0.30933713912963867, "step": 1299, "step_time": 26.592324461991666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.951283022761345, "epoch": 0.013, "grad_norm": 0.0630858838558197, "kl": 0.6031702216714621, "learning_rate": 7.999408488891278e-06, "loss": -0.0316, "step": 1300, "step_time": 12.474236671027029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 4.71999979019165, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.756891238503158, "epoch": 0.01301, "frac_reward_zero_std": 0.25, "grad_norm": 0.03133026510477066, "kl": 0.24357756786048412, "learning_rate": 7.99940755261657e-06, "loss": -0.0684, "num_tokens": 35060269.0, "reward": 0.6369045972824097, "reward_std": 1.221934199333191, "rewards/rollout_reward_func/mean": 0.6369045972824097, "rewards/rollout_reward_func/std": 1.221934199333191, "sampling/importance_sampling_ratio/max": 1.0564407110214233, "sampling/importance_sampling_ratio/mean": 0.6608139872550964, "sampling/importance_sampling_ratio/min": 3.826684860541718e-06, "sampling/sampling_logp_difference/max": 1.9891653060913086, "sampling/sampling_logp_difference/mean": 0.3098049759864807, "step": 1301, "step_time": 31.72956701304065 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 1.7577198171056807, "epoch": 0.01302, "grad_norm": 0.022164184600114822, "kl": 0.23376738652586937, "learning_rate": 7.99940661560153e-06, "loss": -0.0685, "step": 1302, "step_time": 13.711886477976805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 4.159999847412109, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5662922412157059, "epoch": 0.01303, "frac_reward_zero_std": 0.5, "grad_norm": 0.0278331246227026, "kl": 0.23063122481107712, "learning_rate": 7.999405677846155e-06, "loss": -0.051, "num_tokens": 35111644.0, "reward": 0.5525715947151184, "reward_std": 1.045708417892456, "rewards/rollout_reward_func/mean": 0.5525715947151184, "rewards/rollout_reward_func/std": 1.0457082986831665, "sampling/importance_sampling_ratio/max": 1.0505012273788452, "sampling/importance_sampling_ratio/mean": 0.7659586668014526, "sampling/importance_sampling_ratio/min": 2.7877464958692144e-07, "sampling/sampling_logp_difference/max": 2.040635108947754, "sampling/sampling_logp_difference/mean": 0.24117684364318848, "step": 1303, "step_time": 34.89302453401615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.568587590008974, "epoch": 0.01304, "grad_norm": 0.026885930448770523, "kl": 0.22687590681016445, "learning_rate": 7.99940473935045e-06, "loss": -0.0511, "step": 1304, "step_time": 15.99599347601179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 4.90625, "completions/mean_terminated_length": 4.548387050628662, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7809076448902488, "epoch": 0.01305, "frac_reward_zero_std": 0.25, "grad_norm": 0.022469593212008476, "kl": 0.40753629617393017, "learning_rate": 7.99940380011441e-06, "loss": -0.0475, "num_tokens": 35157909.0, "reward": 1.5122761726379395, "reward_std": 0.9112280607223511, "rewards/rollout_reward_func/mean": 1.5122761726379395, "rewards/rollout_reward_func/std": 0.9112280607223511, "sampling/importance_sampling_ratio/max": 1.0641679763793945, "sampling/importance_sampling_ratio/mean": 0.9023028016090393, "sampling/importance_sampling_ratio/min": 1.1033182090613991e-05, "sampling/sampling_logp_difference/max": 1.9260108470916748, "sampling/sampling_logp_difference/mean": 0.19367213547229767, "step": 1305, "step_time": 25.211048828059575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.77676134230569, "epoch": 0.01306, "grad_norm": 0.023895492777228355, "kl": 0.4141358844935894, "learning_rate": 7.999402860138038e-06, "loss": -0.0475, "step": 1306, "step_time": 13.937468661053572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.4375, "completions/mean_terminated_length": 4.583333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6032256931066513, "epoch": 0.01307, "frac_reward_zero_std": 0.0, "grad_norm": 0.05411228910088539, "kl": 0.2675834712572396, "learning_rate": 7.999401919421334e-06, "loss": -0.0424, "num_tokens": 35214517.0, "reward": 0.17203372716903687, "reward_std": 1.1162285804748535, "rewards/rollout_reward_func/mean": 0.17203372716903687, "rewards/rollout_reward_func/std": 1.1162285804748535, "sampling/importance_sampling_ratio/max": 1.2763704061508179, "sampling/importance_sampling_ratio/mean": 0.7061414122581482, "sampling/importance_sampling_ratio/min": 4.238295787217794e-06, "sampling/sampling_logp_difference/max": 1.9677259922027588, "sampling/sampling_logp_difference/mean": 0.306962788105011, "step": 1307, "step_time": 30.10978935394087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.600881108082831, "epoch": 0.01308, "grad_norm": 0.05252743139863014, "kl": 0.26155589148402214, "learning_rate": 7.999400977964298e-06, "loss": -0.0425, "step": 1308, "step_time": 14.367112788982922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 4.962963104248047, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3550606071949005, "epoch": 0.01309, "frac_reward_zero_std": 0.0, "grad_norm": 0.026739908382296562, "kl": 0.3641164265573025, "learning_rate": 7.999400035766932e-06, "loss": -0.0801, "num_tokens": 35267847.0, "reward": 0.3402201533317566, "reward_std": 1.1977590322494507, "rewards/rollout_reward_func/mean": 0.3402201533317566, "rewards/rollout_reward_func/std": 1.1977590322494507, "sampling/importance_sampling_ratio/max": 1.2111561298370361, "sampling/importance_sampling_ratio/mean": 0.7650355696678162, "sampling/importance_sampling_ratio/min": 0.0003496279241517186, "sampling/sampling_logp_difference/max": 1.4383519887924194, "sampling/sampling_logp_difference/mean": 0.2026905119419098, "step": 1309, "step_time": 34.54663570498815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.355750948190689, "epoch": 0.0131, "grad_norm": 0.027027850970625877, "kl": 0.3534500300884247, "learning_rate": 7.999399092829233e-06, "loss": -0.0802, "step": 1310, "step_time": 14.067136272002244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9000548124313354, "epoch": 0.01311, "frac_reward_zero_std": 0.25, "grad_norm": 0.008933193981647491, "kl": 0.2805095072835684, "learning_rate": 7.999398149151204e-06, "loss": -0.0497, "num_tokens": 35318528.0, "reward": 0.6172538995742798, "reward_std": 1.317008137702942, "rewards/rollout_reward_func/mean": 0.6172538995742798, "rewards/rollout_reward_func/std": 1.317008137702942, "sampling/importance_sampling_ratio/max": 1.118285894393921, "sampling/importance_sampling_ratio/mean": 0.8736345767974854, "sampling/importance_sampling_ratio/min": 2.734356030487106e-07, "sampling/sampling_logp_difference/max": 2.245800018310547, "sampling/sampling_logp_difference/mean": 0.23803311586380005, "step": 1311, "step_time": 26.822687278996455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9013017872348428, "epoch": 0.01312, "grad_norm": 0.008602765388786793, "kl": 0.280419934540987, "learning_rate": 7.999397204732844e-06, "loss": -0.0497, "step": 1312, "step_time": 12.635728111985372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.9375, "completions/mean_terminated_length": 4.84615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.014218896627426, "epoch": 0.01313, "frac_reward_zero_std": 0.0, "grad_norm": 0.3186512291431427, "kl": 0.6766969030722976, "learning_rate": 7.999396259574152e-06, "loss": -0.0612, "num_tokens": 35378084.0, "reward": 0.7076592445373535, "reward_std": 1.1162203550338745, "rewards/rollout_reward_func/mean": 0.7076592445373535, "rewards/rollout_reward_func/std": 1.1162203550338745, "sampling/importance_sampling_ratio/max": 1.0998255014419556, "sampling/importance_sampling_ratio/mean": 0.6682165861129761, "sampling/importance_sampling_ratio/min": 1.3980946955705065e-10, "sampling/sampling_logp_difference/max": 2.7432050704956055, "sampling/sampling_logp_difference/mean": 0.42641860246658325, "step": 1313, "step_time": 30.619996748981066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010937500046566129, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010937500046566129, "entropy": 2.0211118338629603, "epoch": 0.01314, "grad_norm": 0.04730593040585518, "kl": 0.6572767067700624, "learning_rate": 7.99939531367513e-06, "loss": -0.0622, "step": 1314, "step_time": 14.003484501939965 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 4.71999979019165, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9859238807111979, "epoch": 0.01315, "frac_reward_zero_std": 0.0, "grad_norm": 0.12478791922330856, "kl": 0.46525645442306995, "learning_rate": 7.999394367035778e-06, "loss": -0.0739, "num_tokens": 35438720.0, "reward": 0.8245512247085571, "reward_std": 1.2830755710601807, "rewards/rollout_reward_func/mean": 0.8245512247085571, "rewards/rollout_reward_func/std": 1.2830756902694702, "sampling/importance_sampling_ratio/max": 1.0419118404388428, "sampling/importance_sampling_ratio/mean": 0.6149064302444458, "sampling/importance_sampling_ratio/min": 3.0191532829348944e-08, "sampling/sampling_logp_difference/max": 2.0353236198425293, "sampling/sampling_logp_difference/mean": 0.4079541563987732, "step": 1315, "step_time": 32.04508905598777 }, { "clip_ratio/high_max": 0.036604021675884724, "clip_ratio/high_mean": 0.018302010837942362, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018302010837942362, "entropy": 1.9841905892826617, "epoch": 0.01316, "grad_norm": 0.0741630271077156, "kl": 0.42331097833812237, "learning_rate": 7.999393419656096e-06, "loss": -0.0745, "step": 1316, "step_time": 16.223278567980742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.96875, "completions/mean_terminated_length": 5.296296119689941, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6934963003732264, "epoch": 0.01317, "frac_reward_zero_std": 0.0, "grad_norm": 0.051213037222623825, "kl": 0.6817388003692031, "learning_rate": 7.999392471536084e-06, "loss": -0.0857, "num_tokens": 35493498.0, "reward": 0.8000015020370483, "reward_std": 1.089577078819275, "rewards/rollout_reward_func/mean": 0.8000015020370483, "rewards/rollout_reward_func/std": 1.0895769596099854, "sampling/importance_sampling_ratio/max": 1.24684739112854, "sampling/importance_sampling_ratio/mean": 0.7190604209899902, "sampling/importance_sampling_ratio/min": 2.537550790293608e-05, "sampling/sampling_logp_difference/max": 2.004565715789795, "sampling/sampling_logp_difference/mean": 0.308724045753479, "step": 1317, "step_time": 28.900689215020975 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 1.6950911972671747, "epoch": 0.01318, "grad_norm": 0.053676534444093704, "kl": 0.6890613436698914, "learning_rate": 7.999391522675743e-06, "loss": -0.0856, "step": 1318, "step_time": 13.432912069023587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.53125, "completions/mean_terminated_length": 5.0625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.9150107353925705, "epoch": 0.01319, "frac_reward_zero_std": 0.0, "grad_norm": 0.028731277212500572, "kl": 0.21013305708765984, "learning_rate": 7.999390573075073e-06, "loss": -0.0823, "num_tokens": 35557390.0, "reward": 0.074666827917099, "reward_std": 1.2025282382965088, "rewards/rollout_reward_func/mean": 0.074666827917099, "rewards/rollout_reward_func/std": 1.2025282382965088, "sampling/importance_sampling_ratio/max": 1.1268799304962158, "sampling/importance_sampling_ratio/mean": 0.40450745820999146, "sampling/importance_sampling_ratio/min": 3.227201617050923e-08, "sampling/sampling_logp_difference/max": 2.307253837585449, "sampling/sampling_logp_difference/mean": 0.4311642050743103, "step": 1319, "step_time": 34.47561634099111 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.004166666883975267, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "entropy": 2.91740445792675, "epoch": 0.0132, "grad_norm": 0.02840842306613922, "kl": 0.19070197734981775, "learning_rate": 7.999389622734072e-06, "loss": -0.0823, "step": 1320, "step_time": 14.202928608021466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.03125, "completions/mean_terminated_length": 4.91304349899292, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.312234543263912, "epoch": 0.01321, "frac_reward_zero_std": 0.25, "grad_norm": 0.07129265367984772, "kl": 0.25606844294816256, "learning_rate": 7.999388671652745e-06, "loss": -0.0715, "num_tokens": 35609162.0, "reward": 0.6019461154937744, "reward_std": 1.3457001447677612, "rewards/rollout_reward_func/mean": 0.6019461154937744, "rewards/rollout_reward_func/std": 1.3457002639770508, "sampling/importance_sampling_ratio/max": 1.2895514965057373, "sampling/importance_sampling_ratio/mean": 0.6394219398498535, "sampling/importance_sampling_ratio/min": 1.5476537829073322e-09, "sampling/sampling_logp_difference/max": 2.4131741523742676, "sampling/sampling_logp_difference/mean": 0.4425313174724579, "step": 1321, "step_time": 28.131763160025002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.305950790643692, "epoch": 0.01322, "grad_norm": 0.07244853675365448, "kl": 0.2506660707294941, "learning_rate": 7.999387719831088e-06, "loss": -0.0714, "step": 1322, "step_time": 12.259785092959646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003289473708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003289473708719015, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 8.84375, "completions/mean_terminated_length": 3.9473683834075928, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.443304643034935, "epoch": 0.01323, "frac_reward_zero_std": 0.25, "grad_norm": 0.014715832658112049, "kl": 0.13306101597845554, "learning_rate": 7.999386767269103e-06, "loss": -0.0654, "num_tokens": 35666365.0, "reward": 0.40191733837127686, "reward_std": 1.3421536684036255, "rewards/rollout_reward_func/mean": 0.40191733837127686, "rewards/rollout_reward_func/std": 1.3421536684036255, "sampling/importance_sampling_ratio/max": 1.1218215227127075, "sampling/importance_sampling_ratio/mean": 0.5781917572021484, "sampling/importance_sampling_ratio/min": 1.6429416049845713e-08, "sampling/sampling_logp_difference/max": 2.228963851928711, "sampling/sampling_logp_difference/mean": 0.4303499758243561, "step": 1323, "step_time": 33.38514674801263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.448492370545864, "epoch": 0.01324, "grad_norm": 0.040238674730062485, "kl": 0.1320202611386776, "learning_rate": 7.999385813966789e-06, "loss": -0.0653, "step": 1324, "step_time": 15.739626790949842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.65625, "completions/mean_terminated_length": 4.809524059295654, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.157968297600746, "epoch": 0.01325, "frac_reward_zero_std": 0.0, "grad_norm": 0.017872560769319534, "kl": 0.2798699578270316, "learning_rate": 7.999384859924148e-06, "loss": -0.0914, "num_tokens": 35727838.0, "reward": 0.5777438282966614, "reward_std": 1.1837657690048218, "rewards/rollout_reward_func/mean": 0.5777438282966614, "rewards/rollout_reward_func/std": 1.1837657690048218, "sampling/importance_sampling_ratio/max": 1.0922114849090576, "sampling/importance_sampling_ratio/mean": 0.5568954348564148, "sampling/importance_sampling_ratio/min": 1.5469486243091524e-05, "sampling/sampling_logp_difference/max": 1.8737527132034302, "sampling/sampling_logp_difference/mean": 0.3870890736579895, "step": 1325, "step_time": 37.094261574035045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1566524505615234, "epoch": 0.01326, "grad_norm": 0.01818949170410633, "kl": 0.27994397934526205, "learning_rate": 7.99938390514118e-06, "loss": -0.0914, "step": 1326, "step_time": 16.06637531099841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 4.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6443491755053401, "epoch": 0.01327, "frac_reward_zero_std": 0.25, "grad_norm": 0.010745033621788025, "kl": 0.36268465407192707, "learning_rate": 7.999382949617882e-06, "loss": -0.0543, "num_tokens": 35776893.0, "reward": 1.5995159149169922, "reward_std": 0.6975824236869812, "rewards/rollout_reward_func/mean": 1.5995159149169922, "rewards/rollout_reward_func/std": 0.697582483291626, "sampling/importance_sampling_ratio/max": 1.0947188138961792, "sampling/importance_sampling_ratio/mean": 0.8969196081161499, "sampling/importance_sampling_ratio/min": 0.001419540960341692, "sampling/sampling_logp_difference/max": 1.8141212463378906, "sampling/sampling_logp_difference/mean": 0.13008297979831696, "step": 1327, "step_time": 24.23607725693728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6423216937109828, "epoch": 0.01328, "grad_norm": 0.010750990360975266, "kl": 0.3617453519254923, "learning_rate": 7.99938199335426e-06, "loss": -0.0543, "step": 1328, "step_time": 12.153535710996948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.4375, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9765538843348622, "epoch": 0.01329, "frac_reward_zero_std": 0.0, "grad_norm": 0.043120384216308594, "kl": 0.22926984075456858, "learning_rate": 7.999381036350309e-06, "loss": -0.0643, "num_tokens": 35834933.0, "reward": 0.1916670948266983, "reward_std": 1.3414571285247803, "rewards/rollout_reward_func/mean": 0.1916670948266983, "rewards/rollout_reward_func/std": 1.3414571285247803, "sampling/importance_sampling_ratio/max": 1.15584397315979, "sampling/importance_sampling_ratio/mean": 0.5687905550003052, "sampling/importance_sampling_ratio/min": 1.337160160375106e-08, "sampling/sampling_logp_difference/max": 2.286635398864746, "sampling/sampling_logp_difference/mean": 0.39063894748687744, "step": 1329, "step_time": 40.02072807494551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9737213281914592, "epoch": 0.0133, "grad_norm": 0.04351195693016052, "kl": 0.22698942944407463, "learning_rate": 7.999380078606032e-06, "loss": -0.0643, "step": 1330, "step_time": 18.037636756023858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.53125, "completions/mean_terminated_length": 5.136363983154297, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5292649269104004, "epoch": 0.01331, "frac_reward_zero_std": 0.0, "grad_norm": 0.059268634766340256, "kl": 0.36697367019951344, "learning_rate": 7.999379120121428e-06, "loss": -0.0695, "num_tokens": 35893429.0, "reward": 0.13284936547279358, "reward_std": 1.2972376346588135, "rewards/rollout_reward_func/mean": 0.13284936547279358, "rewards/rollout_reward_func/std": 1.2972376346588135, "sampling/importance_sampling_ratio/max": 1.151136875152588, "sampling/importance_sampling_ratio/mean": 0.5102232694625854, "sampling/importance_sampling_ratio/min": 5.980084551993059e-06, "sampling/sampling_logp_difference/max": 1.8690022230148315, "sampling/sampling_logp_difference/mean": 0.3921402096748352, "step": 1331, "step_time": 34.926843549037585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 2.531224697828293, "epoch": 0.01332, "grad_norm": 0.0326274149119854, "kl": 0.3554622484371066, "learning_rate": 7.999378160896498e-06, "loss": -0.0695, "step": 1332, "step_time": 13.59840260099736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.46875, "completions/mean_terminated_length": 5.958333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8887563017196953, "epoch": 0.01333, "frac_reward_zero_std": 0.0, "grad_norm": 0.17279654741287231, "kl": 0.30337264481931925, "learning_rate": 7.999377200931244e-06, "loss": -0.0826, "num_tokens": 35957199.0, "reward": 0.2518877387046814, "reward_std": 1.1770044565200806, "rewards/rollout_reward_func/mean": 0.2518877387046814, "rewards/rollout_reward_func/std": 1.1770044565200806, "sampling/importance_sampling_ratio/max": 1.0828917026519775, "sampling/importance_sampling_ratio/mean": 0.5553889870643616, "sampling/importance_sampling_ratio/min": 3.165906673530117e-05, "sampling/sampling_logp_difference/max": 1.7085819244384766, "sampling/sampling_logp_difference/mean": 0.36471760272979736, "step": 1333, "step_time": 35.6555644099717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 1.8873921986669302, "epoch": 0.01334, "grad_norm": 0.10892006009817123, "kl": 0.3262632070109248, "learning_rate": 7.999376240225662e-06, "loss": -0.0831, "step": 1334, "step_time": 15.713060912996298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6900731334462762, "epoch": 0.01335, "frac_reward_zero_std": 0.5, "grad_norm": 0.016196060925722122, "kl": 0.2472805492579937, "learning_rate": 7.999375278779754e-06, "loss": -0.0306, "num_tokens": 36005047.0, "reward": 0.8470521569252014, "reward_std": 1.1509983539581299, "rewards/rollout_reward_func/mean": 0.8470521569252014, "rewards/rollout_reward_func/std": 1.1509983539581299, "sampling/importance_sampling_ratio/max": 1.2093819379806519, "sampling/importance_sampling_ratio/mean": 0.8777878284454346, "sampling/importance_sampling_ratio/min": 0.002542245201766491, "sampling/sampling_logp_difference/max": 1.16408109664917, "sampling/sampling_logp_difference/mean": 0.11262363940477371, "step": 1335, "step_time": 30.575428187061334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6905755549669266, "epoch": 0.01336, "grad_norm": 0.016060328111052513, "kl": 0.24725438002496958, "learning_rate": 7.999374316593522e-06, "loss": -0.0306, "step": 1336, "step_time": 13.744762472895673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.9375, "completions/mean_terminated_length": 5.259259223937988, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5365929063409567, "epoch": 0.01337, "frac_reward_zero_std": 0.5, "grad_norm": 0.026263324543833733, "kl": 0.1967252683825791, "learning_rate": 7.999373353666963e-06, "loss": -0.0522, "num_tokens": 36054622.0, "reward": 0.38589438796043396, "reward_std": 1.2214925289154053, "rewards/rollout_reward_func/mean": 0.38589438796043396, "rewards/rollout_reward_func/std": 1.2214925289154053, "sampling/importance_sampling_ratio/max": 1.268143653869629, "sampling/importance_sampling_ratio/mean": 0.7817904949188232, "sampling/importance_sampling_ratio/min": 3.1276826462089957e-07, "sampling/sampling_logp_difference/max": 2.0632684230804443, "sampling/sampling_logp_difference/mean": 0.29463881254196167, "step": 1337, "step_time": 27.08416605295497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.537487555295229, "epoch": 0.01338, "grad_norm": 0.024259459227323532, "kl": 0.1979524800553918, "learning_rate": 7.999372390000081e-06, "loss": -0.0523, "step": 1338, "step_time": 12.691417918947991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.1875, "completions/mean_terminated_length": 4.636363983154297, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9168497081845999, "epoch": 0.01339, "frac_reward_zero_std": 0.0, "grad_norm": 0.039198726415634155, "kl": 0.27277706004679203, "learning_rate": 7.999371425592876e-06, "loss": -0.0795, "num_tokens": 36104286.0, "reward": 0.776459276676178, "reward_std": 1.2185804843902588, "rewards/rollout_reward_func/mean": 0.776459276676178, "rewards/rollout_reward_func/std": 1.2185804843902588, "sampling/importance_sampling_ratio/max": 1.103912115097046, "sampling/importance_sampling_ratio/mean": 0.5874859094619751, "sampling/importance_sampling_ratio/min": 1.192530021398852e-07, "sampling/sampling_logp_difference/max": 2.1415412425994873, "sampling/sampling_logp_difference/mean": 0.38820838928222656, "step": 1339, "step_time": 26.923350947967265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.913594776764512, "epoch": 0.0134, "grad_norm": 0.03792183846235275, "kl": 0.2779960297048092, "learning_rate": 7.999370460445345e-06, "loss": -0.0796, "step": 1340, "step_time": 12.688690635055536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.4375, "completions/mean_terminated_length": 6.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.280611753463745, "epoch": 0.01341, "frac_reward_zero_std": 0.0, "grad_norm": 0.0673803836107254, "kl": 0.18561555678024888, "learning_rate": 7.99936949455749e-06, "loss": -0.0504, "num_tokens": 36168871.0, "reward": -0.5550720691680908, "reward_std": 0.7637042999267578, "rewards/rollout_reward_func/mean": -0.5550720691680908, "rewards/rollout_reward_func/std": 0.763704240322113, "sampling/importance_sampling_ratio/max": 1.0838721990585327, "sampling/importance_sampling_ratio/mean": 0.23536032438278198, "sampling/importance_sampling_ratio/min": 1.9611375137174036e-06, "sampling/sampling_logp_difference/max": 2.0844764709472656, "sampling/sampling_logp_difference/mean": 0.4911661744117737, "step": 1341, "step_time": 37.5525126659486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.277737259864807, "epoch": 0.01342, "grad_norm": 0.0660753846168518, "kl": 0.17884658323600888, "learning_rate": 7.99936852792931e-06, "loss": -0.0506, "step": 1342, "step_time": 15.63478911301354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 4.285714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1149365822784603, "epoch": 0.01343, "frac_reward_zero_std": 0.0, "grad_norm": 0.018492745235562325, "kl": 0.3572459276765585, "learning_rate": 7.999367560560807e-06, "loss": -0.0608, "num_tokens": 36217548.0, "reward": 0.4765307605266571, "reward_std": 1.2128921747207642, "rewards/rollout_reward_func/mean": 0.4765307605266571, "rewards/rollout_reward_func/std": 1.2128921747207642, "sampling/importance_sampling_ratio/max": 1.1443308591842651, "sampling/importance_sampling_ratio/mean": 0.8760305643081665, "sampling/importance_sampling_ratio/min": 3.2656230359862093e-06, "sampling/sampling_logp_difference/max": 2.0240368843078613, "sampling/sampling_logp_difference/mean": 0.27097374200820923, "step": 1343, "step_time": 24.823781812970992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1162822903133929, "epoch": 0.01344, "grad_norm": 0.017641495913267136, "kl": 0.3581074010580778, "learning_rate": 7.999366592451981e-06, "loss": -0.0608, "step": 1344, "step_time": 12.06375230694539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.90625, "completions/mean_terminated_length": 4.360000133514404, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.469964905641973, "epoch": 0.01345, "frac_reward_zero_std": 0.25, "grad_norm": 0.06279217451810837, "kl": 0.22253119107335806, "learning_rate": 7.999365623602833e-06, "loss": -0.0333, "num_tokens": 36277053.0, "reward": 0.9623817801475525, "reward_std": 1.2043468952178955, "rewards/rollout_reward_func/mean": 0.9623817801475525, "rewards/rollout_reward_func/std": 1.2043468952178955, "sampling/importance_sampling_ratio/max": 1.4430476427078247, "sampling/importance_sampling_ratio/mean": 0.7916600704193115, "sampling/importance_sampling_ratio/min": 9.478048013988882e-06, "sampling/sampling_logp_difference/max": 1.6767361164093018, "sampling/sampling_logp_difference/mean": 0.3215630054473877, "step": 1345, "step_time": 30.93364536802983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4692562920972705, "epoch": 0.01346, "grad_norm": 0.06418043375015259, "kl": 0.22194064408540726, "learning_rate": 7.99936465401336e-06, "loss": -0.0334, "step": 1346, "step_time": 13.478770433051977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.28125, "completions/mean_terminated_length": 5.269230842590332, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8686454482376575, "epoch": 0.01347, "frac_reward_zero_std": 0.25, "grad_norm": 0.11884074658155441, "kl": 0.28574992157518864, "learning_rate": 7.999363683683565e-06, "loss": -0.0482, "num_tokens": 36325119.0, "reward": 0.07116562128067017, "reward_std": 1.3333001136779785, "rewards/rollout_reward_func/mean": 0.07116562128067017, "rewards/rollout_reward_func/std": 1.333299994468689, "sampling/importance_sampling_ratio/max": 1.1780281066894531, "sampling/importance_sampling_ratio/mean": 0.6906710267066956, "sampling/importance_sampling_ratio/min": 3.949352685594931e-05, "sampling/sampling_logp_difference/max": 1.8866914510726929, "sampling/sampling_logp_difference/mean": 0.3480004668235779, "step": 1347, "step_time": 27.092674096958945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.866161372512579, "epoch": 0.01348, "grad_norm": 0.12131452560424805, "kl": 0.29444681853055954, "learning_rate": 7.999362712613448e-06, "loss": -0.0485, "step": 1348, "step_time": 13.37325165499351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.96875, "completions/mean_terminated_length": 6.217391490936279, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3918618708848953, "epoch": 0.01349, "frac_reward_zero_std": 0.0, "grad_norm": 0.09718141704797745, "kl": 0.13905132515355945, "learning_rate": 7.99936174080301e-06, "loss": -0.0977, "num_tokens": 36374794.0, "reward": -0.482493132352829, "reward_std": 1.0371261835098267, "rewards/rollout_reward_func/mean": -0.482493132352829, "rewards/rollout_reward_func/std": 1.0371261835098267, "sampling/importance_sampling_ratio/max": 1.128522276878357, "sampling/importance_sampling_ratio/mean": 0.592963457107544, "sampling/importance_sampling_ratio/min": 3.6558558349497616e-06, "sampling/sampling_logp_difference/max": 1.9819035530090332, "sampling/sampling_logp_difference/mean": 0.3885369896888733, "step": 1349, "step_time": 28.041701706999447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3871405571699142, "epoch": 0.0135, "grad_norm": 0.09080036729574203, "kl": 0.13952969387173653, "learning_rate": 7.999360768252248e-06, "loss": -0.0979, "step": 1350, "step_time": 12.692069076991174 } ], "logging_steps": 1.0, "max_steps": 200000, "num_input_tokens_seen": 36374794, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }