| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.0005625056250562506, |
| "eval_steps": 500, |
| "global_step": 75, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1250.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 465.16668701171875, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 10.142642656962076, |
| "epoch": 7.500075000750007e-06, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 0.0, |
| "loss": 0.0, |
| "num_tokens": 27405.0, |
| "reward": -55.42028045654297, |
| "reward_std": 14.156389236450195, |
| "rewards/rollout_reward_func/mean": -55.420284271240234, |
| "rewards/rollout_reward_func/std": 14.949880599975586, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 15.271873474121094, |
| "sampling/sampling_logp_difference/mean": 3.1089508533477783, |
| "step": 1, |
| "step_time": 20.82328461799989 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.142642656962076, |
| "epoch": 1.5000150001500015e-05, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 9.428571428571429e-07, |
| "loss": 0.0, |
| "step": 2, |
| "step_time": 1.7083880239997598 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1583.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 629.4583740234375, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 10.019515991210938, |
| "epoch": 2.250022500225002e-05, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 1.8857142857142858e-06, |
| "loss": 0.0, |
| "num_tokens": 58385.0, |
| "reward": -44.9024772644043, |
| "reward_std": 21.128307342529297, |
| "rewards/rollout_reward_func/mean": -44.9024772644043, |
| "rewards/rollout_reward_func/std": 20.625934600830078, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 15.504789352416992, |
| "sampling/sampling_logp_difference/mean": 3.0316898822784424, |
| "step": 3, |
| "step_time": 21.70512620000045 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.019515991210938, |
| "epoch": 3.000030000300003e-05, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2.8285714285714288e-06, |
| "loss": 0.0, |
| "step": 4, |
| "step_time": 1.955569071001264 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1297.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 617.125, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 10.11839516957601, |
| "epoch": 3.7500375003750034e-05, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.7714285714285716e-06, |
| "loss": 0.0, |
| "num_tokens": 89832.0, |
| "reward": -52.47894287109375, |
| "reward_std": 14.024870872497559, |
| "rewards/rollout_reward_func/mean": -52.47894287109375, |
| "rewards/rollout_reward_func/std": 15.634257316589355, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 16.115659713745117, |
| "sampling/sampling_logp_difference/mean": 3.077618360519409, |
| "step": 5, |
| "step_time": 20.300307900998632 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.11839516957601, |
| "epoch": 4.500045000450004e-05, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 4.714285714285715e-06, |
| "loss": 0.0, |
| "step": 6, |
| "step_time": 1.7449414639986571 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1512.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 669.3333740234375, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 9.932727654774984, |
| "epoch": 5.250052500525005e-05, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 5.6571428571428576e-06, |
| "loss": 0.0, |
| "num_tokens": 121624.0, |
| "reward": -39.97017288208008, |
| "reward_std": 15.512527465820312, |
| "rewards/rollout_reward_func/mean": -39.97017288208008, |
| "rewards/rollout_reward_func/std": 18.193031311035156, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 14.845789909362793, |
| "sampling/sampling_logp_difference/mean": 3.0627593994140625, |
| "step": 7, |
| "step_time": 23.506451233000007 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 9.932727654774984, |
| "epoch": 6.000060000600006e-05, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 6.600000000000001e-06, |
| "loss": 0.0, |
| "step": 8, |
| "step_time": 2.360350576000201 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.9583333730697632, |
| "completions/max_length": 1098.0, |
| "completions/max_terminated_length": 117.0, |
| "completions/mean_length": 474.8333435058594, |
| "completions/mean_terminated_length": 117.0, |
| "completions/min_length": 117.0, |
| "completions/min_terminated_length": 117.0, |
| "entropy": 10.16798194249471, |
| "epoch": 6.750067500675007e-05, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 7.542857142857143e-06, |
| "loss": 0.0, |
| "num_tokens": 149483.0, |
| "reward": -43.48188018798828, |
| "reward_std": 17.433856964111328, |
| "rewards/rollout_reward_func/mean": -43.481876373291016, |
| "rewards/rollout_reward_func/std": 17.40215301513672, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 15.133698463439941, |
| "sampling/sampling_logp_difference/mean": 3.2317543029785156, |
| "step": 9, |
| "step_time": 21.789242212999852 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.16798194249471, |
| "epoch": 7.500075000750007e-05, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 8.485714285714285e-06, |
| "loss": 0.0, |
| "step": 10, |
| "step_time": 1.6317280789990036 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1519.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 477.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 10.115420500437418, |
| "epoch": 8.250082500825008e-05, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 9.42857142857143e-06, |
| "loss": 0.0, |
| "num_tokens": 177306.0, |
| "reward": -49.76444625854492, |
| "reward_std": 17.622684478759766, |
| "rewards/rollout_reward_func/mean": -49.76444625854492, |
| "rewards/rollout_reward_func/std": 17.78196907043457, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 14.594074249267578, |
| "sampling/sampling_logp_difference/mean": 3.12507963180542, |
| "step": 11, |
| "step_time": 21.353458720001072 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.115420500437418, |
| "epoch": 9.000090000900009e-05, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 1.0371428571428572e-05, |
| "loss": 0.0, |
| "step": 12, |
| "step_time": 1.9042143349997787 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.9583333730697632, |
| "completions/max_length": 1335.0, |
| "completions/max_terminated_length": 850.0, |
| "completions/mean_length": 461.04168701171875, |
| "completions/mean_terminated_length": 850.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 850.0, |
| "entropy": 10.036355336507162, |
| "epoch": 9.75009750097501e-05, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 1.1314285714285715e-05, |
| "loss": 0.0, |
| "num_tokens": 204986.0, |
| "reward": -49.08823776245117, |
| "reward_std": 15.106697082519531, |
| "rewards/rollout_reward_func/mean": -49.08823776245117, |
| "rewards/rollout_reward_func/std": 15.08985710144043, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 15.450621604919434, |
| "sampling/sampling_logp_difference/mean": 3.1363472938537598, |
| "step": 13, |
| "step_time": 21.459205156999815 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.036355336507162, |
| "epoch": 0.0001050010500105001, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 1.2257142857142858e-05, |
| "loss": 0.0, |
| "step": 14, |
| "step_time": 1.7608847870005775 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.9583333730697632, |
| "completions/max_length": 1320.0, |
| "completions/max_terminated_length": 199.0, |
| "completions/mean_length": 546.9583740234375, |
| "completions/mean_terminated_length": 199.0, |
| "completions/min_length": 32.0, |
| "completions/min_terminated_length": 199.0, |
| "entropy": 10.22978941599528, |
| "epoch": 0.00011250112501125012, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 1.3200000000000002e-05, |
| "loss": -0.0, |
| "num_tokens": 234036.0, |
| "reward": -45.839088439941406, |
| "reward_std": 21.189510345458984, |
| "rewards/rollout_reward_func/mean": -45.83908462524414, |
| "rewards/rollout_reward_func/std": 21.63396644592285, |
| "sampling/importance_sampling_ratio/max": 1.9689644722228005e-41, |
| "sampling/importance_sampling_ratio/mean": 8.19759601630018e-43, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 14.650032043457031, |
| "sampling/sampling_logp_difference/mean": 3.18168044090271, |
| "step": 15, |
| "step_time": 20.74833343499995 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.22978941599528, |
| "epoch": 0.00012000120001200012, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 1.4142857142857143e-05, |
| "loss": -0.0, |
| "step": 16, |
| "step_time": 1.7528818500013585 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1465.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 534.5416870117188, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 10.08104912439982, |
| "epoch": 0.00012750127501275012, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 1.5085714285714286e-05, |
| "loss": 0.0, |
| "num_tokens": 262530.0, |
| "reward": -44.546234130859375, |
| "reward_std": 21.182384490966797, |
| "rewards/rollout_reward_func/mean": -44.546234130859375, |
| "rewards/rollout_reward_func/std": 20.58904457092285, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 15.177563667297363, |
| "sampling/sampling_logp_difference/mean": 3.1732728481292725, |
| "step": 17, |
| "step_time": 22.080905613998766 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.08104912439982, |
| "epoch": 0.00013500135001350013, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 1.602857142857143e-05, |
| "loss": 0.0, |
| "step": 18, |
| "step_time": 1.857407227999829 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1961.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 599.7083740234375, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 207.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 9.975990613301596, |
| "epoch": 0.00014250142501425015, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 1.697142857142857e-05, |
| "loss": 0.0, |
| "num_tokens": 293166.0, |
| "reward": -48.2664909362793, |
| "reward_std": 27.311491012573242, |
| "rewards/rollout_reward_func/mean": -48.2664909362793, |
| "rewards/rollout_reward_func/std": 26.92763328552246, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 13.656001091003418, |
| "sampling/sampling_logp_difference/mean": 3.007814407348633, |
| "step": 19, |
| "step_time": 21.696408987000723 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 9.975990613301596, |
| "epoch": 0.00015000150001500014, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 1.7914285714285715e-05, |
| "loss": 0.0, |
| "step": 20, |
| "step_time": 2.2191366179995384 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1628.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 574.875, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 10.216261068979898, |
| "epoch": 0.00015750157501575015, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 1.885714285714286e-05, |
| "loss": 0.0, |
| "num_tokens": 323248.0, |
| "reward": -47.44938659667969, |
| "reward_std": 18.540122985839844, |
| "rewards/rollout_reward_func/mean": -47.44938659667969, |
| "rewards/rollout_reward_func/std": 19.742595672607422, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 15.002847671508789, |
| "sampling/sampling_logp_difference/mean": 3.182793617248535, |
| "step": 21, |
| "step_time": 21.242170976000125 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.216261068979898, |
| "epoch": 0.00016500165001650017, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 1.98e-05, |
| "loss": 0.0, |
| "step": 22, |
| "step_time": 1.98696188500071 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1505.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 566.75, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 10.105806827545166, |
| "epoch": 0.00017250172501725018, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2.0742857142857145e-05, |
| "loss": 0.0, |
| "num_tokens": 353295.0, |
| "reward": -47.93909454345703, |
| "reward_std": 21.568241119384766, |
| "rewards/rollout_reward_func/mean": -47.9390983581543, |
| "rewards/rollout_reward_func/std": 21.119380950927734, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 15.182104110717773, |
| "sampling/sampling_logp_difference/mean": 3.005905866622925, |
| "step": 23, |
| "step_time": 21.250609902998804 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.105806827545166, |
| "epoch": 0.00018000180001800017, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2.1685714285714286e-05, |
| "loss": 0.0, |
| "step": 24, |
| "step_time": 1.9004714529983175 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1371.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 490.4583435058594, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 119.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 10.06377108891805, |
| "epoch": 0.00018750187501875019, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2.262857142857143e-05, |
| "loss": 0.0, |
| "num_tokens": 381535.0, |
| "reward": -45.13935089111328, |
| "reward_std": 19.477066040039062, |
| "rewards/rollout_reward_func/mean": -45.13934326171875, |
| "rewards/rollout_reward_func/std": 20.111757278442383, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 14.868183135986328, |
| "sampling/sampling_logp_difference/mean": 3.132169246673584, |
| "step": 25, |
| "step_time": 20.922161000000415 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.06377108891805, |
| "epoch": 0.0001950019500195002, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2.3571428571428575e-05, |
| "loss": 0.0, |
| "step": 26, |
| "step_time": 1.8088220660001753 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.9583333730697632, |
| "completions/max_length": 1358.0, |
| "completions/max_terminated_length": 587.0, |
| "completions/mean_length": 520.7916870117188, |
| "completions/mean_terminated_length": 587.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 587.0, |
| "entropy": 10.113564809163412, |
| "epoch": 0.00020250202502025022, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2.4514285714285716e-05, |
| "loss": 0.0, |
| "num_tokens": 410593.0, |
| "reward": -44.441925048828125, |
| "reward_std": 25.591054916381836, |
| "rewards/rollout_reward_func/mean": -44.441925048828125, |
| "rewards/rollout_reward_func/std": 26.473356246948242, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 14.11213493347168, |
| "sampling/sampling_logp_difference/mean": 3.1286377906799316, |
| "step": 27, |
| "step_time": 21.701382299001125 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.113564809163412, |
| "epoch": 0.0002100021000210002, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2.545714285714286e-05, |
| "loss": 0.0, |
| "step": 28, |
| "step_time": 1.7842601579986876 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1249.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 547.2916870117188, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 10.082143624623617, |
| "epoch": 0.00021750217502175022, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2.6400000000000005e-05, |
| "loss": 0.0, |
| "num_tokens": 440182.0, |
| "reward": -45.103240966796875, |
| "reward_std": 21.529327392578125, |
| "rewards/rollout_reward_func/mean": -45.103240966796875, |
| "rewards/rollout_reward_func/std": 20.876861572265625, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 14.8016996383667, |
| "sampling/sampling_logp_difference/mean": 3.1039717197418213, |
| "step": 29, |
| "step_time": 21.481158237000272 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.082143624623617, |
| "epoch": 0.00022500225002250023, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2.7342857142857146e-05, |
| "loss": 0.0, |
| "step": 30, |
| "step_time": 2.1532514040018214 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.9583333730697632, |
| "completions/max_length": 1320.0, |
| "completions/max_terminated_length": 545.0, |
| "completions/mean_length": 434.9583435058594, |
| "completions/mean_terminated_length": 545.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 545.0, |
| "entropy": 10.205902258555094, |
| "epoch": 0.00023250232502325022, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2.8285714285714287e-05, |
| "loss": 0.0, |
| "num_tokens": 467213.0, |
| "reward": -46.596885681152344, |
| "reward_std": 16.725446701049805, |
| "rewards/rollout_reward_func/mean": -46.59688186645508, |
| "rewards/rollout_reward_func/std": 16.85680389404297, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 15.102117538452148, |
| "sampling/sampling_logp_difference/mean": 3.278193712234497, |
| "step": 31, |
| "step_time": 21.168742055999246 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.205902258555094, |
| "epoch": 0.00024000240002400024, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2.9228571428571428e-05, |
| "loss": 0.0, |
| "step": 32, |
| "step_time": 1.750853970001117 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1175.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 483.5, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 10.10767682393392, |
| "epoch": 0.00024750247502475025, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.0171428571428572e-05, |
| "loss": 0.0, |
| "num_tokens": 494889.0, |
| "reward": -54.67100143432617, |
| "reward_std": 15.08906364440918, |
| "rewards/rollout_reward_func/mean": -54.67100143432617, |
| "rewards/rollout_reward_func/std": 14.835305213928223, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 13.920793533325195, |
| "sampling/sampling_logp_difference/mean": 3.077173948287964, |
| "step": 33, |
| "step_time": 18.65274686000157 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.10767682393392, |
| "epoch": 0.00025500255002550024, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.1114285714285714e-05, |
| "loss": 0.0, |
| "step": 34, |
| "step_time": 1.6550644490016566 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1209.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 526.3333740234375, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 10.051785469055176, |
| "epoch": 0.0002625026250262503, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.205714285714286e-05, |
| "loss": 0.0, |
| "num_tokens": 524061.0, |
| "reward": -45.748435974121094, |
| "reward_std": 21.241294860839844, |
| "rewards/rollout_reward_func/mean": -45.74843978881836, |
| "rewards/rollout_reward_func/std": 21.725393295288086, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 15.848544120788574, |
| "sampling/sampling_logp_difference/mean": 3.12214994430542, |
| "step": 35, |
| "step_time": 21.866697636000026 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.051785469055176, |
| "epoch": 0.00027000270002700027, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.3e-05, |
| "loss": 0.0, |
| "step": 36, |
| "step_time": 1.6678288799994334 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1409.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 479.75, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 117.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 10.13319698969523, |
| "epoch": 0.00027750277502775026, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.299999999986258e-05, |
| "loss": 0.0, |
| "num_tokens": 551502.0, |
| "reward": -49.105323791503906, |
| "reward_std": 17.612197875976562, |
| "rewards/rollout_reward_func/mean": -49.105316162109375, |
| "rewards/rollout_reward_func/std": 17.828832626342773, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 14.797318458557129, |
| "sampling/sampling_logp_difference/mean": 3.0804805755615234, |
| "step": 37, |
| "step_time": 21.77108188600141 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.13319698969523, |
| "epoch": 0.0002850028500285003, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.299999999945032e-05, |
| "loss": 0.0, |
| "step": 38, |
| "step_time": 1.8092515669995919 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1230.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 487.7083435058594, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 10.18622620900472, |
| "epoch": 0.0002925029250292503, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.299999999876322e-05, |
| "loss": 0.0, |
| "num_tokens": 579575.0, |
| "reward": -52.96455001831055, |
| "reward_std": 15.875425338745117, |
| "rewards/rollout_reward_func/mean": -52.96455001831055, |
| "rewards/rollout_reward_func/std": 15.511474609375, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 14.835306167602539, |
| "sampling/sampling_logp_difference/mean": 3.1505439281463623, |
| "step": 39, |
| "step_time": 17.71383904600225 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.18622620900472, |
| "epoch": 0.0003000030000300003, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.299999999780127e-05, |
| "loss": 0.0, |
| "step": 40, |
| "step_time": 1.6824162199991406 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1092.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 420.2083435058594, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 10.118754863739014, |
| "epoch": 0.0003075030750307503, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.299999999656449e-05, |
| "loss": 0.0, |
| "num_tokens": 606180.0, |
| "reward": -46.9930534362793, |
| "reward_std": 20.2467041015625, |
| "rewards/rollout_reward_func/mean": -46.9930534362793, |
| "rewards/rollout_reward_func/std": 19.9199161529541, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 15.123610496520996, |
| "sampling/sampling_logp_difference/mean": 3.206780195236206, |
| "step": 41, |
| "step_time": 21.204631595998762 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.118754863739014, |
| "epoch": 0.0003150031500315003, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.299999999505286e-05, |
| "loss": 0.0, |
| "step": 42, |
| "step_time": 2.068030981999982 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1052.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 458.91668701171875, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 199.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 10.044801712036133, |
| "epoch": 0.00032250322503225035, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.2999999993266396e-05, |
| "loss": 0.0, |
| "num_tokens": 632958.0, |
| "reward": -49.451873779296875, |
| "reward_std": 19.586355209350586, |
| "rewards/rollout_reward_func/mean": -49.451873779296875, |
| "rewards/rollout_reward_func/std": 19.92502784729004, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 14.691810607910156, |
| "sampling/sampling_logp_difference/mean": 3.104357957839966, |
| "step": 43, |
| "step_time": 20.662249461999636 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.044801712036133, |
| "epoch": 0.00033000330003300033, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.299999999120509e-05, |
| "loss": 0.0, |
| "step": 44, |
| "step_time": 1.6131189530015035 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1517.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 572.25, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 10.101013978322348, |
| "epoch": 0.0003375033750337503, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.299999998886893e-05, |
| "loss": 0.0, |
| "num_tokens": 662770.0, |
| "reward": -43.31492233276367, |
| "reward_std": 21.76721954345703, |
| "rewards/rollout_reward_func/mean": -43.31492233276367, |
| "rewards/rollout_reward_func/std": 21.667734146118164, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 14.86938190460205, |
| "sampling/sampling_logp_difference/mean": 3.125654935836792, |
| "step": 45, |
| "step_time": 22.175719804000437 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.101013978322348, |
| "epoch": 0.00034500345003450036, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.299999998625794e-05, |
| "loss": 0.0, |
| "step": 46, |
| "step_time": 1.9093814330008172 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1180.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 510.8333435058594, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 10.153699239095053, |
| "epoch": 0.00035250352503525035, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.2999999983372116e-05, |
| "loss": 0.0, |
| "num_tokens": 691021.0, |
| "reward": -52.171714782714844, |
| "reward_std": 15.545321464538574, |
| "rewards/rollout_reward_func/mean": -52.17171096801758, |
| "rewards/rollout_reward_func/std": 17.01911735534668, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 14.297968864440918, |
| "sampling/sampling_logp_difference/mean": 3.1078176498413086, |
| "step": 47, |
| "step_time": 18.611535334998734 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.153699239095053, |
| "epoch": 0.00036000360003600034, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.2999999980211436e-05, |
| "loss": 0.0, |
| "step": 48, |
| "step_time": 1.658304570000837 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1470.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 560.8333740234375, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 10.123283545176188, |
| "epoch": 0.0003675036750367504, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.2999999976775924e-05, |
| "loss": 0.0, |
| "num_tokens": 720151.0, |
| "reward": -46.37849044799805, |
| "reward_std": 20.132478713989258, |
| "rewards/rollout_reward_func/mean": -46.37849044799805, |
| "rewards/rollout_reward_func/std": 20.508827209472656, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 14.414722442626953, |
| "sampling/sampling_logp_difference/mean": 3.121212959289551, |
| "step": 49, |
| "step_time": 20.764608049001254 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.123283545176188, |
| "epoch": 0.00037500375003750037, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.2999999973065575e-05, |
| "loss": 0.0, |
| "step": 50, |
| "step_time": 1.8636344930000632 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1200.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 597.5, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 10.12819274266561, |
| "epoch": 0.00038250382503825036, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.299999996908037e-05, |
| "loss": 0.0, |
| "num_tokens": 751114.0, |
| "reward": -45.675315856933594, |
| "reward_std": 19.463340759277344, |
| "rewards/rollout_reward_func/mean": -45.67531204223633, |
| "rewards/rollout_reward_func/std": 19.729412078857422, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 14.647856712341309, |
| "sampling/sampling_logp_difference/mean": 3.114715337753296, |
| "step": 51, |
| "step_time": 21.510169729001063 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.12819274266561, |
| "epoch": 0.0003900039000390004, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.299999996482033e-05, |
| "loss": 0.0, |
| "step": 52, |
| "step_time": 1.669872710999698 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.9583333730697632, |
| "completions/max_length": 1084.0, |
| "completions/max_terminated_length": 273.0, |
| "completions/mean_length": 531.5833740234375, |
| "completions/mean_terminated_length": 273.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 273.0, |
| "entropy": 10.146964073181152, |
| "epoch": 0.0003975039750397504, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.299999996028545e-05, |
| "loss": 0.0, |
| "num_tokens": 779983.0, |
| "reward": -46.03662109375, |
| "reward_std": 19.839950561523438, |
| "rewards/rollout_reward_func/mean": -46.03662109375, |
| "rewards/rollout_reward_func/std": 21.031444549560547, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 14.309167861938477, |
| "sampling/sampling_logp_difference/mean": 3.155137777328491, |
| "step": 53, |
| "step_time": 22.573047874000622 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.146964073181152, |
| "epoch": 0.00040500405004050043, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.2999999955475736e-05, |
| "loss": 0.0, |
| "step": 54, |
| "step_time": 1.612867594999443 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1096.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 520.2083740234375, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 10.004105885823568, |
| "epoch": 0.0004125041250412504, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.299999995039117e-05, |
| "loss": 0.0, |
| "num_tokens": 808253.0, |
| "reward": -50.46704864501953, |
| "reward_std": 17.44925308227539, |
| "rewards/rollout_reward_func/mean": -50.467044830322266, |
| "rewards/rollout_reward_func/std": 16.909671783447266, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 13.991758346557617, |
| "sampling/sampling_logp_difference/mean": 3.0269718170166016, |
| "step": 55, |
| "step_time": 20.465181108998877 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.004105885823568, |
| "epoch": 0.0004200042000420004, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.299999994503177e-05, |
| "loss": 0.0, |
| "step": 56, |
| "step_time": 1.6323036710000451 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1126.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 500.25, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 10.095836321512857, |
| "epoch": 0.00042750427504275045, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.299999993939753e-05, |
| "loss": 0.0, |
| "num_tokens": 836831.0, |
| "reward": -47.74729537963867, |
| "reward_std": 17.828411102294922, |
| "rewards/rollout_reward_func/mean": -47.74729537963867, |
| "rewards/rollout_reward_func/std": 17.13072395324707, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 15.644685745239258, |
| "sampling/sampling_logp_difference/mean": 3.2078018188476562, |
| "step": 57, |
| "step_time": 20.712939544999244 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.095836321512857, |
| "epoch": 0.00043500435004350044, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.299999993348844e-05, |
| "loss": 0.0, |
| "step": 58, |
| "step_time": 1.6430280170015976 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1045.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 562.9166870117188, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 201.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 10.027901649475098, |
| "epoch": 0.0004425044250442504, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.299999992730452e-05, |
| "loss": 0.0, |
| "num_tokens": 866235.0, |
| "reward": -54.31364059448242, |
| "reward_std": 16.991008758544922, |
| "rewards/rollout_reward_func/mean": -54.31363296508789, |
| "rewards/rollout_reward_func/std": 16.77975082397461, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 15.359244346618652, |
| "sampling/sampling_logp_difference/mean": 2.992715835571289, |
| "step": 59, |
| "step_time": 18.364870030999555 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.027901649475098, |
| "epoch": 0.00045000450004500047, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.299999992084575e-05, |
| "loss": 0.0, |
| "step": 60, |
| "step_time": 1.6135677440006475 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1237.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 583.125, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 116.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 10.05492369333903, |
| "epoch": 0.00045750457504575045, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.299999991411214e-05, |
| "loss": 0.0, |
| "num_tokens": 896524.0, |
| "reward": -43.963714599609375, |
| "reward_std": 17.849592208862305, |
| "rewards/rollout_reward_func/mean": -43.963714599609375, |
| "rewards/rollout_reward_func/std": 19.37680435180664, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 14.769742012023926, |
| "sampling/sampling_logp_difference/mean": 3.1108696460723877, |
| "step": 61, |
| "step_time": 23.026458983998964 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.05492369333903, |
| "epoch": 0.00046500465004650044, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.299999990710369e-05, |
| "loss": 0.0, |
| "step": 62, |
| "step_time": 1.680401769999662 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1481.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 542.0833740234375, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 9.974092801411947, |
| "epoch": 0.0004725047250472505, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.2999999899820406e-05, |
| "loss": 0.0, |
| "num_tokens": 925351.0, |
| "reward": -47.469757080078125, |
| "reward_std": 16.11382484436035, |
| "rewards/rollout_reward_func/mean": -47.469757080078125, |
| "rewards/rollout_reward_func/std": 16.915918350219727, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 14.969949722290039, |
| "sampling/sampling_logp_difference/mean": 3.0506622791290283, |
| "step": 63, |
| "step_time": 22.066013471999213 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 9.974092801411947, |
| "epoch": 0.00048000480004800047, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.299999989226227e-05, |
| "loss": 0.0, |
| "step": 64, |
| "step_time": 2.3155115009976726 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1072.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 535.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 10.121665000915527, |
| "epoch": 0.0004875048750487505, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.2999999884429296e-05, |
| "loss": 0.0, |
| "num_tokens": 954640.0, |
| "reward": -47.75408172607422, |
| "reward_std": 16.36182975769043, |
| "rewards/rollout_reward_func/mean": -47.75407791137695, |
| "rewards/rollout_reward_func/std": 19.085969924926758, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 14.185979843139648, |
| "sampling/sampling_logp_difference/mean": 3.126000165939331, |
| "step": 65, |
| "step_time": 19.784562730999824 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.121665000915527, |
| "epoch": 0.0004950049500495005, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.299999987632149e-05, |
| "loss": 0.0, |
| "step": 66, |
| "step_time": 1.618170978001217 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1122.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 500.8333435058594, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 10.120317935943604, |
| "epoch": 0.0005025050250502505, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.299999986793883e-05, |
| "loss": 0.0, |
| "num_tokens": 982892.0, |
| "reward": -53.79902267456055, |
| "reward_std": 15.497881889343262, |
| "rewards/rollout_reward_func/mean": -53.79902267456055, |
| "rewards/rollout_reward_func/std": 17.754192352294922, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 13.600709915161133, |
| "sampling/sampling_logp_difference/mean": 3.1041884422302246, |
| "step": 67, |
| "step_time": 19.112115680999523 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 10.120317935943604, |
| "epoch": 0.0005100051000510005, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.2999999859281335e-05, |
| "loss": 0.0, |
| "step": 68, |
| "step_time": 1.6533779979999963 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1001.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 545.875, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 202.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 9.890536467234293, |
| "epoch": 0.0005175051750517505, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.2999999850349e-05, |
| "loss": 0.0, |
| "num_tokens": 1012345.0, |
| "reward": -59.80120849609375, |
| "reward_std": 9.581160545349121, |
| "rewards/rollout_reward_func/mean": -59.80120849609375, |
| "rewards/rollout_reward_func/std": 10.057110786437988, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 13.439180374145508, |
| "sampling/sampling_logp_difference/mean": 2.835329532623291, |
| "step": 69, |
| "step_time": 18.08387734600001 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 9.890536467234293, |
| "epoch": 0.0005250052500525006, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.299999984114182e-05, |
| "loss": 0.0, |
| "step": 70, |
| "step_time": 1.6130178840003282 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1181.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 559.75, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 9.979540348052979, |
| "epoch": 0.0005325053250532505, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.29999998316598e-05, |
| "loss": 0.0, |
| "num_tokens": 1041646.0, |
| "reward": -48.28936004638672, |
| "reward_std": 18.08314323425293, |
| "rewards/rollout_reward_func/mean": -48.28935623168945, |
| "rewards/rollout_reward_func/std": 17.737323760986328, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 14.49035930633545, |
| "sampling/sampling_logp_difference/mean": 3.0069854259490967, |
| "step": 71, |
| "step_time": 20.73087897000096 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 9.979540348052979, |
| "epoch": 0.0005400054000540005, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.2999999821902936e-05, |
| "loss": 0.0, |
| "step": 72, |
| "step_time": 1.673831482999958 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1129.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 479.875, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 125.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 9.990771611531576, |
| "epoch": 0.0005475054750547505, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.2999999811871234e-05, |
| "loss": 0.0, |
| "num_tokens": 1069537.0, |
| "reward": -46.58069610595703, |
| "reward_std": 21.29560661315918, |
| "rewards/rollout_reward_func/mean": -46.5806884765625, |
| "rewards/rollout_reward_func/std": 21.10353660583496, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 15.358684539794922, |
| "sampling/sampling_logp_difference/mean": 3.0600218772888184, |
| "step": 73, |
| "step_time": 20.745255362002354 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "entropy": 9.990771611531576, |
| "epoch": 0.0005550055500555005, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.299999980156469e-05, |
| "loss": 0.0, |
| "step": 74, |
| "step_time": 1.65707921000012 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 1571.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 533.5833740234375, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 128.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 10.04587491353353, |
| "epoch": 0.0005625056250562506, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.2999999790983306e-05, |
| "loss": 0.0, |
| "num_tokens": 1098538.0, |
| "reward": -43.66608428955078, |
| "reward_std": 16.150455474853516, |
| "rewards/rollout_reward_func/mean": -43.66608810424805, |
| "rewards/rollout_reward_func/std": 18.750696182250977, |
| "sampling/importance_sampling_ratio/max": 0.0, |
| "sampling/importance_sampling_ratio/mean": 0.0, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 15.10157299041748, |
| "sampling/sampling_logp_difference/mean": 3.069899320602417, |
| "step": 75, |
| "step_time": 23.273216087998662 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 666660, |
| "num_input_tokens_seen": 1098538, |
| "num_train_epochs": 5, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|