env_daa33b0 / trainer_state.json
bimabk's picture
Upload task output 1
5b6407b verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.0005625056250562506,
"eval_steps": 500,
"global_step": 75,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1250.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 465.16668701171875,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 0.0,
"entropy": 10.142642656962076,
"epoch": 7.500075000750007e-06,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.0,
"num_tokens": 27405.0,
"reward": -55.42028045654297,
"reward_std": 14.156389236450195,
"rewards/rollout_reward_func/mean": -55.420284271240234,
"rewards/rollout_reward_func/std": 14.949880599975586,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 15.271873474121094,
"sampling/sampling_logp_difference/mean": 3.1089508533477783,
"step": 1,
"step_time": 20.82328461799989
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.142642656962076,
"epoch": 1.5000150001500015e-05,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 9.428571428571429e-07,
"loss": 0.0,
"step": 2,
"step_time": 1.7083880239997598
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1583.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 629.4583740234375,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 0.0,
"entropy": 10.019515991210938,
"epoch": 2.250022500225002e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 1.8857142857142858e-06,
"loss": 0.0,
"num_tokens": 58385.0,
"reward": -44.9024772644043,
"reward_std": 21.128307342529297,
"rewards/rollout_reward_func/mean": -44.9024772644043,
"rewards/rollout_reward_func/std": 20.625934600830078,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 15.504789352416992,
"sampling/sampling_logp_difference/mean": 3.0316898822784424,
"step": 3,
"step_time": 21.70512620000045
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.019515991210938,
"epoch": 3.000030000300003e-05,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2.8285714285714288e-06,
"loss": 0.0,
"step": 4,
"step_time": 1.955569071001264
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1297.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 617.125,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 0.0,
"entropy": 10.11839516957601,
"epoch": 3.7500375003750034e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.7714285714285716e-06,
"loss": 0.0,
"num_tokens": 89832.0,
"reward": -52.47894287109375,
"reward_std": 14.024870872497559,
"rewards/rollout_reward_func/mean": -52.47894287109375,
"rewards/rollout_reward_func/std": 15.634257316589355,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 16.115659713745117,
"sampling/sampling_logp_difference/mean": 3.077618360519409,
"step": 5,
"step_time": 20.300307900998632
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.11839516957601,
"epoch": 4.500045000450004e-05,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 4.714285714285715e-06,
"loss": 0.0,
"step": 6,
"step_time": 1.7449414639986571
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 669.3333740234375,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 0.0,
"entropy": 9.932727654774984,
"epoch": 5.250052500525005e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 5.6571428571428576e-06,
"loss": 0.0,
"num_tokens": 121624.0,
"reward": -39.97017288208008,
"reward_std": 15.512527465820312,
"rewards/rollout_reward_func/mean": -39.97017288208008,
"rewards/rollout_reward_func/std": 18.193031311035156,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 14.845789909362793,
"sampling/sampling_logp_difference/mean": 3.0627593994140625,
"step": 7,
"step_time": 23.506451233000007
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 9.932727654774984,
"epoch": 6.000060000600006e-05,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 6.600000000000001e-06,
"loss": 0.0,
"step": 8,
"step_time": 2.360350576000201
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.9583333730697632,
"completions/max_length": 1098.0,
"completions/max_terminated_length": 117.0,
"completions/mean_length": 474.8333435058594,
"completions/mean_terminated_length": 117.0,
"completions/min_length": 117.0,
"completions/min_terminated_length": 117.0,
"entropy": 10.16798194249471,
"epoch": 6.750067500675007e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 7.542857142857143e-06,
"loss": 0.0,
"num_tokens": 149483.0,
"reward": -43.48188018798828,
"reward_std": 17.433856964111328,
"rewards/rollout_reward_func/mean": -43.481876373291016,
"rewards/rollout_reward_func/std": 17.40215301513672,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 15.133698463439941,
"sampling/sampling_logp_difference/mean": 3.2317543029785156,
"step": 9,
"step_time": 21.789242212999852
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.16798194249471,
"epoch": 7.500075000750007e-05,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 8.485714285714285e-06,
"loss": 0.0,
"step": 10,
"step_time": 1.6317280789990036
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1519.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 477.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 0.0,
"entropy": 10.115420500437418,
"epoch": 8.250082500825008e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 9.42857142857143e-06,
"loss": 0.0,
"num_tokens": 177306.0,
"reward": -49.76444625854492,
"reward_std": 17.622684478759766,
"rewards/rollout_reward_func/mean": -49.76444625854492,
"rewards/rollout_reward_func/std": 17.78196907043457,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 14.594074249267578,
"sampling/sampling_logp_difference/mean": 3.12507963180542,
"step": 11,
"step_time": 21.353458720001072
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.115420500437418,
"epoch": 9.000090000900009e-05,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 1.0371428571428572e-05,
"loss": 0.0,
"step": 12,
"step_time": 1.9042143349997787
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.9583333730697632,
"completions/max_length": 1335.0,
"completions/max_terminated_length": 850.0,
"completions/mean_length": 461.04168701171875,
"completions/mean_terminated_length": 850.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 850.0,
"entropy": 10.036355336507162,
"epoch": 9.75009750097501e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 1.1314285714285715e-05,
"loss": 0.0,
"num_tokens": 204986.0,
"reward": -49.08823776245117,
"reward_std": 15.106697082519531,
"rewards/rollout_reward_func/mean": -49.08823776245117,
"rewards/rollout_reward_func/std": 15.08985710144043,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 15.450621604919434,
"sampling/sampling_logp_difference/mean": 3.1363472938537598,
"step": 13,
"step_time": 21.459205156999815
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.036355336507162,
"epoch": 0.0001050010500105001,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 1.2257142857142858e-05,
"loss": 0.0,
"step": 14,
"step_time": 1.7608847870005775
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.9583333730697632,
"completions/max_length": 1320.0,
"completions/max_terminated_length": 199.0,
"completions/mean_length": 546.9583740234375,
"completions/mean_terminated_length": 199.0,
"completions/min_length": 32.0,
"completions/min_terminated_length": 199.0,
"entropy": 10.22978941599528,
"epoch": 0.00011250112501125012,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 1.3200000000000002e-05,
"loss": -0.0,
"num_tokens": 234036.0,
"reward": -45.839088439941406,
"reward_std": 21.189510345458984,
"rewards/rollout_reward_func/mean": -45.83908462524414,
"rewards/rollout_reward_func/std": 21.63396644592285,
"sampling/importance_sampling_ratio/max": 1.9689644722228005e-41,
"sampling/importance_sampling_ratio/mean": 8.19759601630018e-43,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 14.650032043457031,
"sampling/sampling_logp_difference/mean": 3.18168044090271,
"step": 15,
"step_time": 20.74833343499995
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.22978941599528,
"epoch": 0.00012000120001200012,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 1.4142857142857143e-05,
"loss": -0.0,
"step": 16,
"step_time": 1.7528818500013585
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1465.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 534.5416870117188,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 0.0,
"entropy": 10.08104912439982,
"epoch": 0.00012750127501275012,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 1.5085714285714286e-05,
"loss": 0.0,
"num_tokens": 262530.0,
"reward": -44.546234130859375,
"reward_std": 21.182384490966797,
"rewards/rollout_reward_func/mean": -44.546234130859375,
"rewards/rollout_reward_func/std": 20.58904457092285,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 15.177563667297363,
"sampling/sampling_logp_difference/mean": 3.1732728481292725,
"step": 17,
"step_time": 22.080905613998766
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.08104912439982,
"epoch": 0.00013500135001350013,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 1.602857142857143e-05,
"loss": 0.0,
"step": 18,
"step_time": 1.857407227999829
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1961.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 599.7083740234375,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 207.0,
"completions/min_terminated_length": 0.0,
"entropy": 9.975990613301596,
"epoch": 0.00014250142501425015,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 1.697142857142857e-05,
"loss": 0.0,
"num_tokens": 293166.0,
"reward": -48.2664909362793,
"reward_std": 27.311491012573242,
"rewards/rollout_reward_func/mean": -48.2664909362793,
"rewards/rollout_reward_func/std": 26.92763328552246,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 13.656001091003418,
"sampling/sampling_logp_difference/mean": 3.007814407348633,
"step": 19,
"step_time": 21.696408987000723
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 9.975990613301596,
"epoch": 0.00015000150001500014,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 1.7914285714285715e-05,
"loss": 0.0,
"step": 20,
"step_time": 2.2191366179995384
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1628.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 574.875,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 0.0,
"entropy": 10.216261068979898,
"epoch": 0.00015750157501575015,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 1.885714285714286e-05,
"loss": 0.0,
"num_tokens": 323248.0,
"reward": -47.44938659667969,
"reward_std": 18.540122985839844,
"rewards/rollout_reward_func/mean": -47.44938659667969,
"rewards/rollout_reward_func/std": 19.742595672607422,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 15.002847671508789,
"sampling/sampling_logp_difference/mean": 3.182793617248535,
"step": 21,
"step_time": 21.242170976000125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.216261068979898,
"epoch": 0.00016500165001650017,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 1.98e-05,
"loss": 0.0,
"step": 22,
"step_time": 1.98696188500071
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1505.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 566.75,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 0.0,
"entropy": 10.105806827545166,
"epoch": 0.00017250172501725018,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2.0742857142857145e-05,
"loss": 0.0,
"num_tokens": 353295.0,
"reward": -47.93909454345703,
"reward_std": 21.568241119384766,
"rewards/rollout_reward_func/mean": -47.9390983581543,
"rewards/rollout_reward_func/std": 21.119380950927734,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 15.182104110717773,
"sampling/sampling_logp_difference/mean": 3.005905866622925,
"step": 23,
"step_time": 21.250609902998804
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.105806827545166,
"epoch": 0.00018000180001800017,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2.1685714285714286e-05,
"loss": 0.0,
"step": 24,
"step_time": 1.9004714529983175
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1371.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 490.4583435058594,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 119.0,
"completions/min_terminated_length": 0.0,
"entropy": 10.06377108891805,
"epoch": 0.00018750187501875019,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2.262857142857143e-05,
"loss": 0.0,
"num_tokens": 381535.0,
"reward": -45.13935089111328,
"reward_std": 19.477066040039062,
"rewards/rollout_reward_func/mean": -45.13934326171875,
"rewards/rollout_reward_func/std": 20.111757278442383,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 14.868183135986328,
"sampling/sampling_logp_difference/mean": 3.132169246673584,
"step": 25,
"step_time": 20.922161000000415
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.06377108891805,
"epoch": 0.0001950019500195002,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2.3571428571428575e-05,
"loss": 0.0,
"step": 26,
"step_time": 1.8088220660001753
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.9583333730697632,
"completions/max_length": 1358.0,
"completions/max_terminated_length": 587.0,
"completions/mean_length": 520.7916870117188,
"completions/mean_terminated_length": 587.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 587.0,
"entropy": 10.113564809163412,
"epoch": 0.00020250202502025022,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2.4514285714285716e-05,
"loss": 0.0,
"num_tokens": 410593.0,
"reward": -44.441925048828125,
"reward_std": 25.591054916381836,
"rewards/rollout_reward_func/mean": -44.441925048828125,
"rewards/rollout_reward_func/std": 26.473356246948242,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 14.11213493347168,
"sampling/sampling_logp_difference/mean": 3.1286377906799316,
"step": 27,
"step_time": 21.701382299001125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.113564809163412,
"epoch": 0.0002100021000210002,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2.545714285714286e-05,
"loss": 0.0,
"step": 28,
"step_time": 1.7842601579986876
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1249.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 547.2916870117188,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 0.0,
"entropy": 10.082143624623617,
"epoch": 0.00021750217502175022,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2.6400000000000005e-05,
"loss": 0.0,
"num_tokens": 440182.0,
"reward": -45.103240966796875,
"reward_std": 21.529327392578125,
"rewards/rollout_reward_func/mean": -45.103240966796875,
"rewards/rollout_reward_func/std": 20.876861572265625,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 14.8016996383667,
"sampling/sampling_logp_difference/mean": 3.1039717197418213,
"step": 29,
"step_time": 21.481158237000272
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.082143624623617,
"epoch": 0.00022500225002250023,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2.7342857142857146e-05,
"loss": 0.0,
"step": 30,
"step_time": 2.1532514040018214
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.9583333730697632,
"completions/max_length": 1320.0,
"completions/max_terminated_length": 545.0,
"completions/mean_length": 434.9583435058594,
"completions/mean_terminated_length": 545.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 545.0,
"entropy": 10.205902258555094,
"epoch": 0.00023250232502325022,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2.8285714285714287e-05,
"loss": 0.0,
"num_tokens": 467213.0,
"reward": -46.596885681152344,
"reward_std": 16.725446701049805,
"rewards/rollout_reward_func/mean": -46.59688186645508,
"rewards/rollout_reward_func/std": 16.85680389404297,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 15.102117538452148,
"sampling/sampling_logp_difference/mean": 3.278193712234497,
"step": 31,
"step_time": 21.168742055999246
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.205902258555094,
"epoch": 0.00024000240002400024,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2.9228571428571428e-05,
"loss": 0.0,
"step": 32,
"step_time": 1.750853970001117
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1175.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 483.5,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 0.0,
"entropy": 10.10767682393392,
"epoch": 0.00024750247502475025,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.0171428571428572e-05,
"loss": 0.0,
"num_tokens": 494889.0,
"reward": -54.67100143432617,
"reward_std": 15.08906364440918,
"rewards/rollout_reward_func/mean": -54.67100143432617,
"rewards/rollout_reward_func/std": 14.835305213928223,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 13.920793533325195,
"sampling/sampling_logp_difference/mean": 3.077173948287964,
"step": 33,
"step_time": 18.65274686000157
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.10767682393392,
"epoch": 0.00025500255002550024,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.1114285714285714e-05,
"loss": 0.0,
"step": 34,
"step_time": 1.6550644490016566
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1209.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 526.3333740234375,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 0.0,
"entropy": 10.051785469055176,
"epoch": 0.0002625026250262503,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.205714285714286e-05,
"loss": 0.0,
"num_tokens": 524061.0,
"reward": -45.748435974121094,
"reward_std": 21.241294860839844,
"rewards/rollout_reward_func/mean": -45.74843978881836,
"rewards/rollout_reward_func/std": 21.725393295288086,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 15.848544120788574,
"sampling/sampling_logp_difference/mean": 3.12214994430542,
"step": 35,
"step_time": 21.866697636000026
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.051785469055176,
"epoch": 0.00027000270002700027,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.3e-05,
"loss": 0.0,
"step": 36,
"step_time": 1.6678288799994334
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1409.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 479.75,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 117.0,
"completions/min_terminated_length": 0.0,
"entropy": 10.13319698969523,
"epoch": 0.00027750277502775026,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.299999999986258e-05,
"loss": 0.0,
"num_tokens": 551502.0,
"reward": -49.105323791503906,
"reward_std": 17.612197875976562,
"rewards/rollout_reward_func/mean": -49.105316162109375,
"rewards/rollout_reward_func/std": 17.828832626342773,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 14.797318458557129,
"sampling/sampling_logp_difference/mean": 3.0804805755615234,
"step": 37,
"step_time": 21.77108188600141
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.13319698969523,
"epoch": 0.0002850028500285003,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.299999999945032e-05,
"loss": 0.0,
"step": 38,
"step_time": 1.8092515669995919
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1230.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 487.7083435058594,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 0.0,
"entropy": 10.18622620900472,
"epoch": 0.0002925029250292503,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.299999999876322e-05,
"loss": 0.0,
"num_tokens": 579575.0,
"reward": -52.96455001831055,
"reward_std": 15.875425338745117,
"rewards/rollout_reward_func/mean": -52.96455001831055,
"rewards/rollout_reward_func/std": 15.511474609375,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 14.835306167602539,
"sampling/sampling_logp_difference/mean": 3.1505439281463623,
"step": 39,
"step_time": 17.71383904600225
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.18622620900472,
"epoch": 0.0003000030000300003,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.299999999780127e-05,
"loss": 0.0,
"step": 40,
"step_time": 1.6824162199991406
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1092.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 420.2083435058594,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 0.0,
"entropy": 10.118754863739014,
"epoch": 0.0003075030750307503,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.299999999656449e-05,
"loss": 0.0,
"num_tokens": 606180.0,
"reward": -46.9930534362793,
"reward_std": 20.2467041015625,
"rewards/rollout_reward_func/mean": -46.9930534362793,
"rewards/rollout_reward_func/std": 19.9199161529541,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 15.123610496520996,
"sampling/sampling_logp_difference/mean": 3.206780195236206,
"step": 41,
"step_time": 21.204631595998762
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.118754863739014,
"epoch": 0.0003150031500315003,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.299999999505286e-05,
"loss": 0.0,
"step": 42,
"step_time": 2.068030981999982
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1052.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 458.91668701171875,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 199.0,
"completions/min_terminated_length": 0.0,
"entropy": 10.044801712036133,
"epoch": 0.00032250322503225035,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.2999999993266396e-05,
"loss": 0.0,
"num_tokens": 632958.0,
"reward": -49.451873779296875,
"reward_std": 19.586355209350586,
"rewards/rollout_reward_func/mean": -49.451873779296875,
"rewards/rollout_reward_func/std": 19.92502784729004,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 14.691810607910156,
"sampling/sampling_logp_difference/mean": 3.104357957839966,
"step": 43,
"step_time": 20.662249461999636
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.044801712036133,
"epoch": 0.00033000330003300033,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.299999999120509e-05,
"loss": 0.0,
"step": 44,
"step_time": 1.6131189530015035
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1517.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 572.25,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 0.0,
"entropy": 10.101013978322348,
"epoch": 0.0003375033750337503,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.299999998886893e-05,
"loss": 0.0,
"num_tokens": 662770.0,
"reward": -43.31492233276367,
"reward_std": 21.76721954345703,
"rewards/rollout_reward_func/mean": -43.31492233276367,
"rewards/rollout_reward_func/std": 21.667734146118164,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 14.86938190460205,
"sampling/sampling_logp_difference/mean": 3.125654935836792,
"step": 45,
"step_time": 22.175719804000437
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.101013978322348,
"epoch": 0.00034500345003450036,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.299999998625794e-05,
"loss": 0.0,
"step": 46,
"step_time": 1.9093814330008172
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1180.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 510.8333435058594,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 0.0,
"entropy": 10.153699239095053,
"epoch": 0.00035250352503525035,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.2999999983372116e-05,
"loss": 0.0,
"num_tokens": 691021.0,
"reward": -52.171714782714844,
"reward_std": 15.545321464538574,
"rewards/rollout_reward_func/mean": -52.17171096801758,
"rewards/rollout_reward_func/std": 17.01911735534668,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 14.297968864440918,
"sampling/sampling_logp_difference/mean": 3.1078176498413086,
"step": 47,
"step_time": 18.611535334998734
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.153699239095053,
"epoch": 0.00036000360003600034,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.2999999980211436e-05,
"loss": 0.0,
"step": 48,
"step_time": 1.658304570000837
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1470.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 560.8333740234375,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 0.0,
"entropy": 10.123283545176188,
"epoch": 0.0003675036750367504,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.2999999976775924e-05,
"loss": 0.0,
"num_tokens": 720151.0,
"reward": -46.37849044799805,
"reward_std": 20.132478713989258,
"rewards/rollout_reward_func/mean": -46.37849044799805,
"rewards/rollout_reward_func/std": 20.508827209472656,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 14.414722442626953,
"sampling/sampling_logp_difference/mean": 3.121212959289551,
"step": 49,
"step_time": 20.764608049001254
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.123283545176188,
"epoch": 0.00037500375003750037,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.2999999973065575e-05,
"loss": 0.0,
"step": 50,
"step_time": 1.8636344930000632
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1200.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 597.5,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 0.0,
"entropy": 10.12819274266561,
"epoch": 0.00038250382503825036,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.299999996908037e-05,
"loss": 0.0,
"num_tokens": 751114.0,
"reward": -45.675315856933594,
"reward_std": 19.463340759277344,
"rewards/rollout_reward_func/mean": -45.67531204223633,
"rewards/rollout_reward_func/std": 19.729412078857422,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 14.647856712341309,
"sampling/sampling_logp_difference/mean": 3.114715337753296,
"step": 51,
"step_time": 21.510169729001063
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.12819274266561,
"epoch": 0.0003900039000390004,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.299999996482033e-05,
"loss": 0.0,
"step": 52,
"step_time": 1.669872710999698
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.9583333730697632,
"completions/max_length": 1084.0,
"completions/max_terminated_length": 273.0,
"completions/mean_length": 531.5833740234375,
"completions/mean_terminated_length": 273.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 273.0,
"entropy": 10.146964073181152,
"epoch": 0.0003975039750397504,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.299999996028545e-05,
"loss": 0.0,
"num_tokens": 779983.0,
"reward": -46.03662109375,
"reward_std": 19.839950561523438,
"rewards/rollout_reward_func/mean": -46.03662109375,
"rewards/rollout_reward_func/std": 21.031444549560547,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 14.309167861938477,
"sampling/sampling_logp_difference/mean": 3.155137777328491,
"step": 53,
"step_time": 22.573047874000622
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.146964073181152,
"epoch": 0.00040500405004050043,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.2999999955475736e-05,
"loss": 0.0,
"step": 54,
"step_time": 1.612867594999443
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1096.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 520.2083740234375,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 0.0,
"entropy": 10.004105885823568,
"epoch": 0.0004125041250412504,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.299999995039117e-05,
"loss": 0.0,
"num_tokens": 808253.0,
"reward": -50.46704864501953,
"reward_std": 17.44925308227539,
"rewards/rollout_reward_func/mean": -50.467044830322266,
"rewards/rollout_reward_func/std": 16.909671783447266,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 13.991758346557617,
"sampling/sampling_logp_difference/mean": 3.0269718170166016,
"step": 55,
"step_time": 20.465181108998877
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.004105885823568,
"epoch": 0.0004200042000420004,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.299999994503177e-05,
"loss": 0.0,
"step": 56,
"step_time": 1.6323036710000451
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1126.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 500.25,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 0.0,
"entropy": 10.095836321512857,
"epoch": 0.00042750427504275045,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.299999993939753e-05,
"loss": 0.0,
"num_tokens": 836831.0,
"reward": -47.74729537963867,
"reward_std": 17.828411102294922,
"rewards/rollout_reward_func/mean": -47.74729537963867,
"rewards/rollout_reward_func/std": 17.13072395324707,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 15.644685745239258,
"sampling/sampling_logp_difference/mean": 3.2078018188476562,
"step": 57,
"step_time": 20.712939544999244
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.095836321512857,
"epoch": 0.00043500435004350044,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.299999993348844e-05,
"loss": 0.0,
"step": 58,
"step_time": 1.6430280170015976
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1045.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 562.9166870117188,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 201.0,
"completions/min_terminated_length": 0.0,
"entropy": 10.027901649475098,
"epoch": 0.0004425044250442504,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.299999992730452e-05,
"loss": 0.0,
"num_tokens": 866235.0,
"reward": -54.31364059448242,
"reward_std": 16.991008758544922,
"rewards/rollout_reward_func/mean": -54.31363296508789,
"rewards/rollout_reward_func/std": 16.77975082397461,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 15.359244346618652,
"sampling/sampling_logp_difference/mean": 2.992715835571289,
"step": 59,
"step_time": 18.364870030999555
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.027901649475098,
"epoch": 0.00045000450004500047,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.299999992084575e-05,
"loss": 0.0,
"step": 60,
"step_time": 1.6135677440006475
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1237.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 583.125,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 116.0,
"completions/min_terminated_length": 0.0,
"entropy": 10.05492369333903,
"epoch": 0.00045750457504575045,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.299999991411214e-05,
"loss": 0.0,
"num_tokens": 896524.0,
"reward": -43.963714599609375,
"reward_std": 17.849592208862305,
"rewards/rollout_reward_func/mean": -43.963714599609375,
"rewards/rollout_reward_func/std": 19.37680435180664,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 14.769742012023926,
"sampling/sampling_logp_difference/mean": 3.1108696460723877,
"step": 61,
"step_time": 23.026458983998964
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.05492369333903,
"epoch": 0.00046500465004650044,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.299999990710369e-05,
"loss": 0.0,
"step": 62,
"step_time": 1.680401769999662
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1481.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 542.0833740234375,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 0.0,
"entropy": 9.974092801411947,
"epoch": 0.0004725047250472505,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.2999999899820406e-05,
"loss": 0.0,
"num_tokens": 925351.0,
"reward": -47.469757080078125,
"reward_std": 16.11382484436035,
"rewards/rollout_reward_func/mean": -47.469757080078125,
"rewards/rollout_reward_func/std": 16.915918350219727,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 14.969949722290039,
"sampling/sampling_logp_difference/mean": 3.0506622791290283,
"step": 63,
"step_time": 22.066013471999213
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 9.974092801411947,
"epoch": 0.00048000480004800047,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.299999989226227e-05,
"loss": 0.0,
"step": 64,
"step_time": 2.3155115009976726
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1072.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 535.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 0.0,
"entropy": 10.121665000915527,
"epoch": 0.0004875048750487505,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.2999999884429296e-05,
"loss": 0.0,
"num_tokens": 954640.0,
"reward": -47.75408172607422,
"reward_std": 16.36182975769043,
"rewards/rollout_reward_func/mean": -47.75407791137695,
"rewards/rollout_reward_func/std": 19.085969924926758,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 14.185979843139648,
"sampling/sampling_logp_difference/mean": 3.126000165939331,
"step": 65,
"step_time": 19.784562730999824
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.121665000915527,
"epoch": 0.0004950049500495005,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.299999987632149e-05,
"loss": 0.0,
"step": 66,
"step_time": 1.618170978001217
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1122.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 500.8333435058594,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 0.0,
"entropy": 10.120317935943604,
"epoch": 0.0005025050250502505,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.299999986793883e-05,
"loss": 0.0,
"num_tokens": 982892.0,
"reward": -53.79902267456055,
"reward_std": 15.497881889343262,
"rewards/rollout_reward_func/mean": -53.79902267456055,
"rewards/rollout_reward_func/std": 17.754192352294922,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 13.600709915161133,
"sampling/sampling_logp_difference/mean": 3.1041884422302246,
"step": 67,
"step_time": 19.112115680999523
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 10.120317935943604,
"epoch": 0.0005100051000510005,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.2999999859281335e-05,
"loss": 0.0,
"step": 68,
"step_time": 1.6533779979999963
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1001.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 545.875,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 202.0,
"completions/min_terminated_length": 0.0,
"entropy": 9.890536467234293,
"epoch": 0.0005175051750517505,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.2999999850349e-05,
"loss": 0.0,
"num_tokens": 1012345.0,
"reward": -59.80120849609375,
"reward_std": 9.581160545349121,
"rewards/rollout_reward_func/mean": -59.80120849609375,
"rewards/rollout_reward_func/std": 10.057110786437988,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 13.439180374145508,
"sampling/sampling_logp_difference/mean": 2.835329532623291,
"step": 69,
"step_time": 18.08387734600001
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 9.890536467234293,
"epoch": 0.0005250052500525006,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.299999984114182e-05,
"loss": 0.0,
"step": 70,
"step_time": 1.6130178840003282
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1181.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 559.75,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 0.0,
"entropy": 9.979540348052979,
"epoch": 0.0005325053250532505,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.29999998316598e-05,
"loss": 0.0,
"num_tokens": 1041646.0,
"reward": -48.28936004638672,
"reward_std": 18.08314323425293,
"rewards/rollout_reward_func/mean": -48.28935623168945,
"rewards/rollout_reward_func/std": 17.737323760986328,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 14.49035930633545,
"sampling/sampling_logp_difference/mean": 3.0069854259490967,
"step": 71,
"step_time": 20.73087897000096
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 9.979540348052979,
"epoch": 0.0005400054000540005,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.2999999821902936e-05,
"loss": 0.0,
"step": 72,
"step_time": 1.673831482999958
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1129.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 479.875,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 125.0,
"completions/min_terminated_length": 0.0,
"entropy": 9.990771611531576,
"epoch": 0.0005475054750547505,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.2999999811871234e-05,
"loss": 0.0,
"num_tokens": 1069537.0,
"reward": -46.58069610595703,
"reward_std": 21.29560661315918,
"rewards/rollout_reward_func/mean": -46.5806884765625,
"rewards/rollout_reward_func/std": 21.10353660583496,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 15.358684539794922,
"sampling/sampling_logp_difference/mean": 3.0600218772888184,
"step": 73,
"step_time": 20.745255362002354
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 9.990771611531576,
"epoch": 0.0005550055500555005,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.299999980156469e-05,
"loss": 0.0,
"step": 74,
"step_time": 1.65707921000012
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 1571.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 533.5833740234375,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 128.0,
"completions/min_terminated_length": 0.0,
"entropy": 10.04587491353353,
"epoch": 0.0005625056250562506,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.2999999790983306e-05,
"loss": 0.0,
"num_tokens": 1098538.0,
"reward": -43.66608428955078,
"reward_std": 16.150455474853516,
"rewards/rollout_reward_func/mean": -43.66608810424805,
"rewards/rollout_reward_func/std": 18.750696182250977,
"sampling/importance_sampling_ratio/max": 0.0,
"sampling/importance_sampling_ratio/mean": 0.0,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 15.10157299041748,
"sampling/sampling_logp_difference/mean": 3.069899320602417,
"step": 75,
"step_time": 23.273216087998662
}
],
"logging_steps": 1.0,
"max_steps": 666660,
"num_input_tokens_seen": 1098538,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}