{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0005625056250562506, "eval_steps": 500, "global_step": 75, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1250.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 465.16668701171875, "completions/mean_terminated_length": 0.0, "completions/min_length": 128.0, "completions/min_terminated_length": 0.0, "entropy": 10.142642656962076, "epoch": 7.500075000750007e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "num_tokens": 27405.0, "reward": -55.42028045654297, "reward_std": 14.156389236450195, "rewards/rollout_reward_func/mean": -55.420284271240234, "rewards/rollout_reward_func/std": 14.949880599975586, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.271873474121094, "sampling/sampling_logp_difference/mean": 3.1089508533477783, "step": 1, "step_time": 20.82328461799989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.142642656962076, "epoch": 1.5000150001500015e-05, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 9.428571428571429e-07, "loss": 0.0, "step": 2, "step_time": 1.7083880239997598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1583.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 629.4583740234375, "completions/mean_terminated_length": 0.0, "completions/min_length": 128.0, "completions/min_terminated_length": 0.0, "entropy": 10.019515991210938, "epoch": 2.250022500225002e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.8857142857142858e-06, "loss": 0.0, "num_tokens": 58385.0, "reward": -44.9024772644043, "reward_std": 21.128307342529297, "rewards/rollout_reward_func/mean": -44.9024772644043, "rewards/rollout_reward_func/std": 20.625934600830078, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.504789352416992, "sampling/sampling_logp_difference/mean": 3.0316898822784424, "step": 3, "step_time": 21.70512620000045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.019515991210938, "epoch": 3.000030000300003e-05, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.8285714285714288e-06, "loss": 0.0, "step": 4, "step_time": 1.955569071001264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1297.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 617.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 128.0, "completions/min_terminated_length": 0.0, "entropy": 10.11839516957601, "epoch": 3.7500375003750034e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.7714285714285716e-06, "loss": 0.0, "num_tokens": 89832.0, "reward": -52.47894287109375, "reward_std": 14.024870872497559, "rewards/rollout_reward_func/mean": -52.47894287109375, "rewards/rollout_reward_func/std": 15.634257316589355, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.115659713745117, "sampling/sampling_logp_difference/mean": 3.077618360519409, "step": 5, "step_time": 20.300307900998632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.11839516957601, "epoch": 4.500045000450004e-05, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.714285714285715e-06, "loss": 0.0, "step": 6, "step_time": 1.7449414639986571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 669.3333740234375, "completions/mean_terminated_length": 0.0, "completions/min_length": 128.0, "completions/min_terminated_length": 0.0, "entropy": 9.932727654774984, "epoch": 5.250052500525005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 5.6571428571428576e-06, "loss": 0.0, "num_tokens": 121624.0, "reward": -39.97017288208008, "reward_std": 15.512527465820312, "rewards/rollout_reward_func/mean": -39.97017288208008, "rewards/rollout_reward_func/std": 18.193031311035156, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.845789909362793, "sampling/sampling_logp_difference/mean": 3.0627593994140625, "step": 7, "step_time": 23.506451233000007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 9.932727654774984, "epoch": 6.000060000600006e-05, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 6.600000000000001e-06, "loss": 0.0, "step": 8, "step_time": 2.360350576000201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9583333730697632, "completions/max_length": 1098.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 474.8333435058594, "completions/mean_terminated_length": 117.0, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 10.16798194249471, "epoch": 6.750067500675007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 7.542857142857143e-06, "loss": 0.0, "num_tokens": 149483.0, "reward": -43.48188018798828, "reward_std": 17.433856964111328, "rewards/rollout_reward_func/mean": -43.481876373291016, "rewards/rollout_reward_func/std": 17.40215301513672, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.133698463439941, "sampling/sampling_logp_difference/mean": 3.2317543029785156, "step": 9, "step_time": 21.789242212999852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.16798194249471, "epoch": 7.500075000750007e-05, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 8.485714285714285e-06, "loss": 0.0, "step": 10, "step_time": 1.6317280789990036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1519.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 477.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 128.0, "completions/min_terminated_length": 0.0, "entropy": 10.115420500437418, "epoch": 8.250082500825008e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 9.42857142857143e-06, "loss": 0.0, "num_tokens": 177306.0, "reward": -49.76444625854492, "reward_std": 17.622684478759766, "rewards/rollout_reward_func/mean": -49.76444625854492, "rewards/rollout_reward_func/std": 17.78196907043457, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.594074249267578, "sampling/sampling_logp_difference/mean": 3.12507963180542, "step": 11, "step_time": 21.353458720001072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.115420500437418, "epoch": 9.000090000900009e-05, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.0371428571428572e-05, "loss": 0.0, "step": 12, "step_time": 1.9042143349997787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9583333730697632, "completions/max_length": 1335.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 461.04168701171875, "completions/mean_terminated_length": 850.0, "completions/min_length": 128.0, "completions/min_terminated_length": 850.0, "entropy": 10.036355336507162, "epoch": 9.75009750097501e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.1314285714285715e-05, "loss": 0.0, "num_tokens": 204986.0, "reward": -49.08823776245117, "reward_std": 15.106697082519531, "rewards/rollout_reward_func/mean": -49.08823776245117, "rewards/rollout_reward_func/std": 15.08985710144043, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.450621604919434, "sampling/sampling_logp_difference/mean": 3.1363472938537598, "step": 13, "step_time": 21.459205156999815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.036355336507162, "epoch": 0.0001050010500105001, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.2257142857142858e-05, "loss": 0.0, "step": 14, "step_time": 1.7608847870005775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9583333730697632, "completions/max_length": 1320.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 546.9583740234375, "completions/mean_terminated_length": 199.0, "completions/min_length": 32.0, "completions/min_terminated_length": 199.0, "entropy": 10.22978941599528, "epoch": 0.00011250112501125012, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.3200000000000002e-05, "loss": -0.0, "num_tokens": 234036.0, "reward": -45.839088439941406, "reward_std": 21.189510345458984, "rewards/rollout_reward_func/mean": -45.83908462524414, "rewards/rollout_reward_func/std": 21.63396644592285, "sampling/importance_sampling_ratio/max": 1.9689644722228005e-41, "sampling/importance_sampling_ratio/mean": 8.19759601630018e-43, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.650032043457031, "sampling/sampling_logp_difference/mean": 3.18168044090271, "step": 15, "step_time": 20.74833343499995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.22978941599528, "epoch": 0.00012000120001200012, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.4142857142857143e-05, "loss": -0.0, "step": 16, "step_time": 1.7528818500013585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1465.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 534.5416870117188, "completions/mean_terminated_length": 0.0, "completions/min_length": 128.0, "completions/min_terminated_length": 0.0, "entropy": 10.08104912439982, "epoch": 0.00012750127501275012, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.5085714285714286e-05, "loss": 0.0, "num_tokens": 262530.0, "reward": -44.546234130859375, "reward_std": 21.182384490966797, "rewards/rollout_reward_func/mean": -44.546234130859375, "rewards/rollout_reward_func/std": 20.58904457092285, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.177563667297363, "sampling/sampling_logp_difference/mean": 3.1732728481292725, "step": 17, "step_time": 22.080905613998766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.08104912439982, "epoch": 0.00013500135001350013, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.602857142857143e-05, "loss": 0.0, "step": 18, "step_time": 1.857407227999829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1961.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 599.7083740234375, "completions/mean_terminated_length": 0.0, "completions/min_length": 207.0, "completions/min_terminated_length": 0.0, "entropy": 9.975990613301596, "epoch": 0.00014250142501425015, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.697142857142857e-05, "loss": 0.0, "num_tokens": 293166.0, "reward": -48.2664909362793, "reward_std": 27.311491012573242, "rewards/rollout_reward_func/mean": -48.2664909362793, "rewards/rollout_reward_func/std": 26.92763328552246, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.656001091003418, "sampling/sampling_logp_difference/mean": 3.007814407348633, "step": 19, "step_time": 21.696408987000723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 9.975990613301596, "epoch": 0.00015000150001500014, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.7914285714285715e-05, "loss": 0.0, "step": 20, "step_time": 2.2191366179995384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1628.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 574.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 128.0, "completions/min_terminated_length": 0.0, "entropy": 10.216261068979898, "epoch": 0.00015750157501575015, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.885714285714286e-05, "loss": 0.0, "num_tokens": 323248.0, "reward": -47.44938659667969, "reward_std": 18.540122985839844, "rewards/rollout_reward_func/mean": -47.44938659667969, "rewards/rollout_reward_func/std": 19.742595672607422, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.002847671508789, "sampling/sampling_logp_difference/mean": 3.182793617248535, "step": 21, "step_time": 21.242170976000125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.216261068979898, "epoch": 0.00016500165001650017, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.98e-05, "loss": 0.0, "step": 22, "step_time": 1.98696188500071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1505.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 566.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 128.0, "completions/min_terminated_length": 0.0, "entropy": 10.105806827545166, "epoch": 0.00017250172501725018, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.0742857142857145e-05, "loss": 0.0, "num_tokens": 353295.0, "reward": -47.93909454345703, "reward_std": 21.568241119384766, "rewards/rollout_reward_func/mean": -47.9390983581543, "rewards/rollout_reward_func/std": 21.119380950927734, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.182104110717773, "sampling/sampling_logp_difference/mean": 3.005905866622925, "step": 23, "step_time": 21.250609902998804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.105806827545166, "epoch": 0.00018000180001800017, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.1685714285714286e-05, "loss": 0.0, "step": 24, "step_time": 1.9004714529983175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1371.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 490.4583435058594, "completions/mean_terminated_length": 0.0, "completions/min_length": 119.0, "completions/min_terminated_length": 0.0, "entropy": 10.06377108891805, "epoch": 0.00018750187501875019, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.262857142857143e-05, "loss": 0.0, "num_tokens": 381535.0, "reward": -45.13935089111328, "reward_std": 19.477066040039062, "rewards/rollout_reward_func/mean": -45.13934326171875, "rewards/rollout_reward_func/std": 20.111757278442383, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.868183135986328, "sampling/sampling_logp_difference/mean": 3.132169246673584, "step": 25, "step_time": 20.922161000000415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.06377108891805, "epoch": 0.0001950019500195002, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.3571428571428575e-05, "loss": 0.0, "step": 26, "step_time": 1.8088220660001753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9583333730697632, "completions/max_length": 1358.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 520.7916870117188, "completions/mean_terminated_length": 587.0, "completions/min_length": 128.0, "completions/min_terminated_length": 587.0, "entropy": 10.113564809163412, "epoch": 0.00020250202502025022, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.4514285714285716e-05, "loss": 0.0, "num_tokens": 410593.0, "reward": -44.441925048828125, "reward_std": 25.591054916381836, "rewards/rollout_reward_func/mean": -44.441925048828125, "rewards/rollout_reward_func/std": 26.473356246948242, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.11213493347168, "sampling/sampling_logp_difference/mean": 3.1286377906799316, "step": 27, "step_time": 21.701382299001125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.113564809163412, "epoch": 0.0002100021000210002, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.545714285714286e-05, "loss": 0.0, "step": 28, "step_time": 1.7842601579986876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1249.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 547.2916870117188, "completions/mean_terminated_length": 0.0, "completions/min_length": 128.0, "completions/min_terminated_length": 0.0, "entropy": 10.082143624623617, "epoch": 0.00021750217502175022, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.6400000000000005e-05, "loss": 0.0, "num_tokens": 440182.0, "reward": -45.103240966796875, "reward_std": 21.529327392578125, "rewards/rollout_reward_func/mean": -45.103240966796875, "rewards/rollout_reward_func/std": 20.876861572265625, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.8016996383667, "sampling/sampling_logp_difference/mean": 3.1039717197418213, "step": 29, "step_time": 21.481158237000272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.082143624623617, "epoch": 0.00022500225002250023, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.7342857142857146e-05, "loss": 0.0, "step": 30, "step_time": 2.1532514040018214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9583333730697632, "completions/max_length": 1320.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 434.9583435058594, "completions/mean_terminated_length": 545.0, "completions/min_length": 128.0, "completions/min_terminated_length": 545.0, "entropy": 10.205902258555094, "epoch": 0.00023250232502325022, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.8285714285714287e-05, "loss": 0.0, "num_tokens": 467213.0, "reward": -46.596885681152344, "reward_std": 16.725446701049805, "rewards/rollout_reward_func/mean": -46.59688186645508, "rewards/rollout_reward_func/std": 16.85680389404297, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.102117538452148, "sampling/sampling_logp_difference/mean": 3.278193712234497, "step": 31, "step_time": 21.168742055999246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.205902258555094, "epoch": 0.00024000240002400024, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.9228571428571428e-05, "loss": 0.0, "step": 32, "step_time": 1.750853970001117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1175.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 483.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 128.0, "completions/min_terminated_length": 0.0, "entropy": 10.10767682393392, "epoch": 0.00024750247502475025, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.0171428571428572e-05, "loss": 0.0, "num_tokens": 494889.0, "reward": -54.67100143432617, "reward_std": 15.08906364440918, "rewards/rollout_reward_func/mean": -54.67100143432617, "rewards/rollout_reward_func/std": 14.835305213928223, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.920793533325195, "sampling/sampling_logp_difference/mean": 3.077173948287964, "step": 33, "step_time": 18.65274686000157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.10767682393392, "epoch": 0.00025500255002550024, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.1114285714285714e-05, "loss": 0.0, "step": 34, "step_time": 1.6550644490016566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1209.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 526.3333740234375, "completions/mean_terminated_length": 0.0, "completions/min_length": 128.0, "completions/min_terminated_length": 0.0, "entropy": 10.051785469055176, "epoch": 0.0002625026250262503, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.205714285714286e-05, "loss": 0.0, "num_tokens": 524061.0, "reward": -45.748435974121094, "reward_std": 21.241294860839844, "rewards/rollout_reward_func/mean": -45.74843978881836, "rewards/rollout_reward_func/std": 21.725393295288086, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.848544120788574, "sampling/sampling_logp_difference/mean": 3.12214994430542, "step": 35, "step_time": 21.866697636000026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.051785469055176, "epoch": 0.00027000270002700027, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.3e-05, "loss": 0.0, "step": 36, "step_time": 1.6678288799994334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1409.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 479.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 117.0, "completions/min_terminated_length": 0.0, "entropy": 10.13319698969523, "epoch": 0.00027750277502775026, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.299999999986258e-05, "loss": 0.0, "num_tokens": 551502.0, "reward": -49.105323791503906, "reward_std": 17.612197875976562, "rewards/rollout_reward_func/mean": -49.105316162109375, "rewards/rollout_reward_func/std": 17.828832626342773, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.797318458557129, "sampling/sampling_logp_difference/mean": 3.0804805755615234, "step": 37, "step_time": 21.77108188600141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.13319698969523, "epoch": 0.0002850028500285003, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.299999999945032e-05, "loss": 0.0, "step": 38, "step_time": 1.8092515669995919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1230.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 487.7083435058594, "completions/mean_terminated_length": 0.0, "completions/min_length": 128.0, "completions/min_terminated_length": 0.0, "entropy": 10.18622620900472, "epoch": 0.0002925029250292503, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.299999999876322e-05, "loss": 0.0, "num_tokens": 579575.0, "reward": -52.96455001831055, "reward_std": 15.875425338745117, "rewards/rollout_reward_func/mean": -52.96455001831055, "rewards/rollout_reward_func/std": 15.511474609375, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.835306167602539, "sampling/sampling_logp_difference/mean": 3.1505439281463623, "step": 39, "step_time": 17.71383904600225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.18622620900472, "epoch": 0.0003000030000300003, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.299999999780127e-05, "loss": 0.0, "step": 40, "step_time": 1.6824162199991406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1092.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 420.2083435058594, "completions/mean_terminated_length": 0.0, "completions/min_length": 128.0, "completions/min_terminated_length": 0.0, "entropy": 10.118754863739014, "epoch": 0.0003075030750307503, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.299999999656449e-05, "loss": 0.0, "num_tokens": 606180.0, "reward": -46.9930534362793, "reward_std": 20.2467041015625, "rewards/rollout_reward_func/mean": -46.9930534362793, "rewards/rollout_reward_func/std": 19.9199161529541, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.123610496520996, "sampling/sampling_logp_difference/mean": 3.206780195236206, "step": 41, "step_time": 21.204631595998762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.118754863739014, "epoch": 0.0003150031500315003, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.299999999505286e-05, "loss": 0.0, "step": 42, "step_time": 2.068030981999982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1052.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 458.91668701171875, "completions/mean_terminated_length": 0.0, "completions/min_length": 199.0, "completions/min_terminated_length": 0.0, "entropy": 10.044801712036133, "epoch": 0.00032250322503225035, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.2999999993266396e-05, "loss": 0.0, "num_tokens": 632958.0, "reward": -49.451873779296875, "reward_std": 19.586355209350586, "rewards/rollout_reward_func/mean": -49.451873779296875, "rewards/rollout_reward_func/std": 19.92502784729004, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.691810607910156, "sampling/sampling_logp_difference/mean": 3.104357957839966, "step": 43, "step_time": 20.662249461999636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.044801712036133, "epoch": 0.00033000330003300033, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.299999999120509e-05, "loss": 0.0, "step": 44, "step_time": 1.6131189530015035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1517.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 572.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 128.0, "completions/min_terminated_length": 0.0, "entropy": 10.101013978322348, "epoch": 0.0003375033750337503, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.299999998886893e-05, "loss": 0.0, "num_tokens": 662770.0, "reward": -43.31492233276367, "reward_std": 21.76721954345703, "rewards/rollout_reward_func/mean": -43.31492233276367, "rewards/rollout_reward_func/std": 21.667734146118164, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.86938190460205, "sampling/sampling_logp_difference/mean": 3.125654935836792, "step": 45, "step_time": 22.175719804000437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.101013978322348, "epoch": 0.00034500345003450036, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.299999998625794e-05, "loss": 0.0, "step": 46, "step_time": 1.9093814330008172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1180.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 510.8333435058594, "completions/mean_terminated_length": 0.0, "completions/min_length": 128.0, "completions/min_terminated_length": 0.0, "entropy": 10.153699239095053, "epoch": 0.00035250352503525035, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.2999999983372116e-05, "loss": 0.0, "num_tokens": 691021.0, "reward": -52.171714782714844, "reward_std": 15.545321464538574, "rewards/rollout_reward_func/mean": -52.17171096801758, "rewards/rollout_reward_func/std": 17.01911735534668, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.297968864440918, "sampling/sampling_logp_difference/mean": 3.1078176498413086, "step": 47, "step_time": 18.611535334998734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.153699239095053, "epoch": 0.00036000360003600034, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.2999999980211436e-05, "loss": 0.0, "step": 48, "step_time": 1.658304570000837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1470.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 560.8333740234375, "completions/mean_terminated_length": 0.0, "completions/min_length": 128.0, "completions/min_terminated_length": 0.0, "entropy": 10.123283545176188, "epoch": 0.0003675036750367504, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.2999999976775924e-05, "loss": 0.0, "num_tokens": 720151.0, "reward": -46.37849044799805, "reward_std": 20.132478713989258, "rewards/rollout_reward_func/mean": -46.37849044799805, "rewards/rollout_reward_func/std": 20.508827209472656, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.414722442626953, "sampling/sampling_logp_difference/mean": 3.121212959289551, "step": 49, "step_time": 20.764608049001254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.123283545176188, "epoch": 0.00037500375003750037, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.2999999973065575e-05, "loss": 0.0, "step": 50, "step_time": 1.8636344930000632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1200.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 597.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 128.0, "completions/min_terminated_length": 0.0, "entropy": 10.12819274266561, "epoch": 0.00038250382503825036, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.299999996908037e-05, "loss": 0.0, "num_tokens": 751114.0, "reward": -45.675315856933594, "reward_std": 19.463340759277344, "rewards/rollout_reward_func/mean": -45.67531204223633, "rewards/rollout_reward_func/std": 19.729412078857422, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.647856712341309, "sampling/sampling_logp_difference/mean": 3.114715337753296, "step": 51, "step_time": 21.510169729001063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.12819274266561, "epoch": 0.0003900039000390004, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.299999996482033e-05, "loss": 0.0, "step": 52, "step_time": 1.669872710999698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9583333730697632, "completions/max_length": 1084.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 531.5833740234375, "completions/mean_terminated_length": 273.0, "completions/min_length": 128.0, "completions/min_terminated_length": 273.0, "entropy": 10.146964073181152, "epoch": 0.0003975039750397504, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.299999996028545e-05, "loss": 0.0, "num_tokens": 779983.0, "reward": -46.03662109375, "reward_std": 19.839950561523438, "rewards/rollout_reward_func/mean": -46.03662109375, "rewards/rollout_reward_func/std": 21.031444549560547, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.309167861938477, "sampling/sampling_logp_difference/mean": 3.155137777328491, "step": 53, "step_time": 22.573047874000622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.146964073181152, "epoch": 0.00040500405004050043, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.2999999955475736e-05, "loss": 0.0, "step": 54, "step_time": 1.612867594999443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1096.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 520.2083740234375, "completions/mean_terminated_length": 0.0, "completions/min_length": 128.0, "completions/min_terminated_length": 0.0, "entropy": 10.004105885823568, "epoch": 0.0004125041250412504, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.299999995039117e-05, "loss": 0.0, "num_tokens": 808253.0, "reward": -50.46704864501953, "reward_std": 17.44925308227539, "rewards/rollout_reward_func/mean": -50.467044830322266, "rewards/rollout_reward_func/std": 16.909671783447266, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.991758346557617, "sampling/sampling_logp_difference/mean": 3.0269718170166016, "step": 55, "step_time": 20.465181108998877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.004105885823568, "epoch": 0.0004200042000420004, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.299999994503177e-05, "loss": 0.0, "step": 56, "step_time": 1.6323036710000451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1126.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 500.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 128.0, "completions/min_terminated_length": 0.0, "entropy": 10.095836321512857, "epoch": 0.00042750427504275045, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.299999993939753e-05, "loss": 0.0, "num_tokens": 836831.0, "reward": -47.74729537963867, "reward_std": 17.828411102294922, "rewards/rollout_reward_func/mean": -47.74729537963867, "rewards/rollout_reward_func/std": 17.13072395324707, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.644685745239258, "sampling/sampling_logp_difference/mean": 3.2078018188476562, "step": 57, "step_time": 20.712939544999244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.095836321512857, "epoch": 0.00043500435004350044, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.299999993348844e-05, "loss": 0.0, "step": 58, "step_time": 1.6430280170015976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1045.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 562.9166870117188, "completions/mean_terminated_length": 0.0, "completions/min_length": 201.0, "completions/min_terminated_length": 0.0, "entropy": 10.027901649475098, "epoch": 0.0004425044250442504, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.299999992730452e-05, "loss": 0.0, "num_tokens": 866235.0, "reward": -54.31364059448242, "reward_std": 16.991008758544922, "rewards/rollout_reward_func/mean": -54.31363296508789, "rewards/rollout_reward_func/std": 16.77975082397461, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.359244346618652, "sampling/sampling_logp_difference/mean": 2.992715835571289, "step": 59, "step_time": 18.364870030999555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.027901649475098, "epoch": 0.00045000450004500047, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.299999992084575e-05, "loss": 0.0, "step": 60, "step_time": 1.6135677440006475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1237.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 583.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 116.0, "completions/min_terminated_length": 0.0, "entropy": 10.05492369333903, "epoch": 0.00045750457504575045, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.299999991411214e-05, "loss": 0.0, "num_tokens": 896524.0, "reward": -43.963714599609375, "reward_std": 17.849592208862305, "rewards/rollout_reward_func/mean": -43.963714599609375, "rewards/rollout_reward_func/std": 19.37680435180664, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.769742012023926, "sampling/sampling_logp_difference/mean": 3.1108696460723877, "step": 61, "step_time": 23.026458983998964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.05492369333903, "epoch": 0.00046500465004650044, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.299999990710369e-05, "loss": 0.0, "step": 62, "step_time": 1.680401769999662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1481.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 542.0833740234375, "completions/mean_terminated_length": 0.0, "completions/min_length": 128.0, "completions/min_terminated_length": 0.0, "entropy": 9.974092801411947, "epoch": 0.0004725047250472505, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.2999999899820406e-05, "loss": 0.0, "num_tokens": 925351.0, "reward": -47.469757080078125, "reward_std": 16.11382484436035, "rewards/rollout_reward_func/mean": -47.469757080078125, "rewards/rollout_reward_func/std": 16.915918350219727, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.969949722290039, "sampling/sampling_logp_difference/mean": 3.0506622791290283, "step": 63, "step_time": 22.066013471999213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 9.974092801411947, "epoch": 0.00048000480004800047, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.299999989226227e-05, "loss": 0.0, "step": 64, "step_time": 2.3155115009976726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1072.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 535.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 128.0, "completions/min_terminated_length": 0.0, "entropy": 10.121665000915527, "epoch": 0.0004875048750487505, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.2999999884429296e-05, "loss": 0.0, "num_tokens": 954640.0, "reward": -47.75408172607422, "reward_std": 16.36182975769043, "rewards/rollout_reward_func/mean": -47.75407791137695, "rewards/rollout_reward_func/std": 19.085969924926758, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.185979843139648, "sampling/sampling_logp_difference/mean": 3.126000165939331, "step": 65, "step_time": 19.784562730999824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.121665000915527, "epoch": 0.0004950049500495005, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.299999987632149e-05, "loss": 0.0, "step": 66, "step_time": 1.618170978001217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1122.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 500.8333435058594, "completions/mean_terminated_length": 0.0, "completions/min_length": 128.0, "completions/min_terminated_length": 0.0, "entropy": 10.120317935943604, "epoch": 0.0005025050250502505, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.299999986793883e-05, "loss": 0.0, "num_tokens": 982892.0, "reward": -53.79902267456055, "reward_std": 15.497881889343262, "rewards/rollout_reward_func/mean": -53.79902267456055, "rewards/rollout_reward_func/std": 17.754192352294922, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.600709915161133, "sampling/sampling_logp_difference/mean": 3.1041884422302246, "step": 67, "step_time": 19.112115680999523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 10.120317935943604, "epoch": 0.0005100051000510005, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.2999999859281335e-05, "loss": 0.0, "step": 68, "step_time": 1.6533779979999963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 545.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 202.0, "completions/min_terminated_length": 0.0, "entropy": 9.890536467234293, "epoch": 0.0005175051750517505, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.2999999850349e-05, "loss": 0.0, "num_tokens": 1012345.0, "reward": -59.80120849609375, "reward_std": 9.581160545349121, "rewards/rollout_reward_func/mean": -59.80120849609375, "rewards/rollout_reward_func/std": 10.057110786437988, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.439180374145508, "sampling/sampling_logp_difference/mean": 2.835329532623291, "step": 69, "step_time": 18.08387734600001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 9.890536467234293, "epoch": 0.0005250052500525006, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.299999984114182e-05, "loss": 0.0, "step": 70, "step_time": 1.6130178840003282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1181.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 559.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 128.0, "completions/min_terminated_length": 0.0, "entropy": 9.979540348052979, "epoch": 0.0005325053250532505, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.29999998316598e-05, "loss": 0.0, "num_tokens": 1041646.0, "reward": -48.28936004638672, "reward_std": 18.08314323425293, "rewards/rollout_reward_func/mean": -48.28935623168945, "rewards/rollout_reward_func/std": 17.737323760986328, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.49035930633545, "sampling/sampling_logp_difference/mean": 3.0069854259490967, "step": 71, "step_time": 20.73087897000096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 9.979540348052979, "epoch": 0.0005400054000540005, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.2999999821902936e-05, "loss": 0.0, "step": 72, "step_time": 1.673831482999958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1129.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 479.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 125.0, "completions/min_terminated_length": 0.0, "entropy": 9.990771611531576, "epoch": 0.0005475054750547505, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.2999999811871234e-05, "loss": 0.0, "num_tokens": 1069537.0, "reward": -46.58069610595703, "reward_std": 21.29560661315918, "rewards/rollout_reward_func/mean": -46.5806884765625, "rewards/rollout_reward_func/std": 21.10353660583496, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.358684539794922, "sampling/sampling_logp_difference/mean": 3.0600218772888184, "step": 73, "step_time": 20.745255362002354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 9.990771611531576, "epoch": 0.0005550055500555005, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.299999980156469e-05, "loss": 0.0, "step": 74, "step_time": 1.65707921000012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1571.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 533.5833740234375, "completions/mean_terminated_length": 0.0, "completions/min_length": 128.0, "completions/min_terminated_length": 0.0, "entropy": 10.04587491353353, "epoch": 0.0005625056250562506, "frac_reward_zero_std": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.2999999790983306e-05, "loss": 0.0, "num_tokens": 1098538.0, "reward": -43.66608428955078, "reward_std": 16.150455474853516, "rewards/rollout_reward_func/mean": -43.66608810424805, "rewards/rollout_reward_func/std": 18.750696182250977, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.10157299041748, "sampling/sampling_logp_difference/mean": 3.069899320602417, "step": 75, "step_time": 23.273216087998662 } ], "logging_steps": 1.0, "max_steps": 666660, "num_input_tokens_seen": 1098538, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }