smk-ld / trainer_state.json
Masnuy's picture
Upload task output 1
e01b768 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.00055,
"eval_steps": 500,
"global_step": 55,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2.0,
"completions/max_terminated_length": 2.0,
"completions/mean_length": 2.0,
"completions/mean_terminated_length": 2.0,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.940144538879395,
"epoch": 1e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.03423422574996948,
"kl": 0.0,
"learning_rate": 0.0,
"loss": -0.0013,
"num_tokens": 35616.0,
"reward": -0.7051675319671631,
"reward_std": 0.7764065265655518,
"rewards/rollout_reward_func/mean": -0.7051675319671631,
"rewards/rollout_reward_func/std": 0.75037682056427,
"sampling/importance_sampling_ratio/max": 0.06733503937721252,
"sampling/importance_sampling_ratio/mean": 0.035891756415367126,
"sampling/importance_sampling_ratio/min": 0.012922381982207298,
"sampling/sampling_logp_difference/max": 2.4574475288391113,
"sampling/sampling_logp_difference/mean": 1.7373101711273193,
"step": 1,
"step_time": 6.607899043003272
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.940144538879395,
"epoch": 2e-05,
"grad_norm": 0.03577549755573273,
"kl": 0.0,
"learning_rate": 2.8571428571428575e-07,
"loss": -0.0013,
"step": 2,
"step_time": 2.9063545979988703
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2.0,
"completions/max_terminated_length": 2.0,
"completions/mean_length": 2.0,
"completions/mean_terminated_length": 2.0,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.965680599212646,
"epoch": 3e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.017016781494021416,
"kl": 0.0007822737097740173,
"learning_rate": 5.714285714285715e-07,
"loss": -0.0006,
"num_tokens": 71095.0,
"reward": -0.9110076427459717,
"reward_std": 0.6931561231613159,
"rewards/rollout_reward_func/mean": -0.9110076427459717,
"rewards/rollout_reward_func/std": 0.6800154447555542,
"sampling/importance_sampling_ratio/max": 0.06864165514707565,
"sampling/importance_sampling_ratio/mean": 0.03215230628848076,
"sampling/importance_sampling_ratio/min": 0.011430883780121803,
"sampling/sampling_logp_difference/max": 2.474456548690796,
"sampling/sampling_logp_difference/mean": 1.8041703701019287,
"step": 3,
"step_time": 5.5894952089984145
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.965598821640015,
"epoch": 4e-05,
"grad_norm": 0.01733771711587906,
"kl": 0.0007491949945688248,
"learning_rate": 8.571428571428572e-07,
"loss": -0.0006,
"step": 4,
"step_time": 3.4044442560007155
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16.0,
"completions/max_terminated_length": 5.0,
"completions/mean_length": 2.53125,
"completions/mean_terminated_length": 2.096774101257324,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.834780097007751,
"epoch": 5e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.02108524739742279,
"kl": 0.0009654137102188542,
"learning_rate": 1.142857142857143e-06,
"loss": -0.0003,
"num_tokens": 106490.0,
"reward": -0.5540984869003296,
"reward_std": 0.8771607279777527,
"rewards/rollout_reward_func/mean": -0.5540984869003296,
"rewards/rollout_reward_func/std": 0.8618184924125671,
"sampling/importance_sampling_ratio/max": 0.07213470339775085,
"sampling/importance_sampling_ratio/mean": 0.03297191113233566,
"sampling/importance_sampling_ratio/min": 3.0050444771445584e-11,
"sampling/sampling_logp_difference/max": 4.576776504516602,
"sampling/sampling_logp_difference/mean": 1.773134469985962,
"step": 5,
"step_time": 6.008008040997083
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.827986240386963,
"epoch": 6e-05,
"grad_norm": 0.021368548274040222,
"kl": 0.0009469666983932257,
"learning_rate": 1.4285714285714286e-06,
"loss": -0.0004,
"step": 6,
"step_time": 2.88994878000085
},
{
"clip_ratio/high_max": 0.03125,
"clip_ratio/high_mean": 0.015625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 16.0,
"completions/max_terminated_length": 2.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.0,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.796356916427612,
"epoch": 7e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.013663483783602715,
"kl": 0.0008234605193138123,
"learning_rate": 1.7142857142857145e-06,
"loss": 0.0,
"num_tokens": 142069.0,
"reward": -0.8088920712471008,
"reward_std": 0.7424027323722839,
"rewards/rollout_reward_func/mean": -0.8088920712471008,
"rewards/rollout_reward_func/std": 0.7662962675094604,
"sampling/importance_sampling_ratio/max": 0.057457707822322845,
"sampling/importance_sampling_ratio/mean": 0.02730659209191799,
"sampling/importance_sampling_ratio/min": 7.280681058041694e-10,
"sampling/sampling_logp_difference/max": 4.222927093505859,
"sampling/sampling_logp_difference/mean": 1.6366889476776123,
"step": 7,
"step_time": 5.921918200005166
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.786743104457855,
"epoch": 8e-05,
"grad_norm": 0.013285573571920395,
"kl": 0.0009508101793471724,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.0,
"step": 8,
"step_time": 2.9387520060008683
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2.0,
"completions/max_terminated_length": 2.0,
"completions/mean_length": 2.0,
"completions/mean_terminated_length": 2.0,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.922944903373718,
"epoch": 9e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01735098287463188,
"kl": 0.0008866805583238602,
"learning_rate": 2.285714285714286e-06,
"loss": -0.0002,
"num_tokens": 176547.0,
"reward": -0.618694543838501,
"reward_std": 0.8990023136138916,
"rewards/rollout_reward_func/mean": -0.618694543838501,
"rewards/rollout_reward_func/std": 0.8754127621650696,
"sampling/importance_sampling_ratio/max": 0.06334654986858368,
"sampling/importance_sampling_ratio/mean": 0.03222377225756645,
"sampling/importance_sampling_ratio/min": 0.011594683863222599,
"sampling/sampling_logp_difference/max": 2.4042437076568604,
"sampling/sampling_logp_difference/mean": 1.7828483581542969,
"step": 9,
"step_time": 5.69735375300661
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.913637280464172,
"epoch": 0.0001,
"grad_norm": 0.017596419900655746,
"kl": 0.000972965732216835,
"learning_rate": 2.571428571428571e-06,
"loss": -0.0002,
"step": 10,
"step_time": 3.580516988000454
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5.0,
"completions/max_terminated_length": 5.0,
"completions/mean_length": 2.15625,
"completions/mean_terminated_length": 2.15625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.979163527488708,
"epoch": 0.00011,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01395466923713684,
"kl": 0.0012971882097190246,
"learning_rate": 2.8571428571428573e-06,
"loss": -0.0002,
"num_tokens": 210671.0,
"reward": -0.6838527917861938,
"reward_std": 0.7062864899635315,
"rewards/rollout_reward_func/mean": -0.6838527917861938,
"rewards/rollout_reward_func/std": 0.7574694752693176,
"sampling/importance_sampling_ratio/max": 0.06857945024967194,
"sampling/importance_sampling_ratio/mean": 0.03003668040037155,
"sampling/importance_sampling_ratio/min": 7.147054475353798e-06,
"sampling/sampling_logp_difference/max": 4.250937461853027,
"sampling/sampling_logp_difference/mean": 1.8635720014572144,
"step": 11,
"step_time": 6.033825367005193
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.966804146766663,
"epoch": 0.00012,
"grad_norm": 0.01391494832932949,
"kl": 0.0018893439264502376,
"learning_rate": 3.142857142857143e-06,
"loss": -0.0002,
"step": 12,
"step_time": 2.8316939499891305
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5.0,
"completions/max_terminated_length": 5.0,
"completions/mean_length": 2.28125,
"completions/mean_terminated_length": 2.28125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.810548543930054,
"epoch": 0.00013,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.014992384240031242,
"kl": 0.003853602087474428,
"learning_rate": 3.428571428571429e-06,
"loss": 0.0001,
"num_tokens": 245676.0,
"reward": -0.6364654302597046,
"reward_std": 0.7521181106567383,
"rewards/rollout_reward_func/mean": -0.6364654302597046,
"rewards/rollout_reward_func/std": 0.7526334524154663,
"sampling/importance_sampling_ratio/max": 0.06722358614206314,
"sampling/importance_sampling_ratio/mean": 0.03307785466313362,
"sampling/importance_sampling_ratio/min": 3.5045477488893084e-06,
"sampling/sampling_logp_difference/max": 4.873165607452393,
"sampling/sampling_logp_difference/mean": 1.8621257543563843,
"step": 13,
"step_time": 5.768471701994713
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.802866578102112,
"epoch": 0.00014,
"grad_norm": 0.014974371530115604,
"kl": 0.004468549799639732,
"learning_rate": 3.7142857142857146e-06,
"loss": 0.0001,
"step": 14,
"step_time": 2.8839251570025226
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16.0,
"completions/max_terminated_length": 2.0,
"completions/mean_length": 2.4375,
"completions/mean_terminated_length": 2.0,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.717366337776184,
"epoch": 0.00015,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.020232753828167915,
"kl": 0.004971407979610376,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0008,
"num_tokens": 282194.0,
"reward": -0.7452265024185181,
"reward_std": 0.7260236144065857,
"rewards/rollout_reward_func/mean": -0.7452265024185181,
"rewards/rollout_reward_func/std": 0.7854404449462891,
"sampling/importance_sampling_ratio/max": 0.084043949842453,
"sampling/importance_sampling_ratio/mean": 0.03686349838972092,
"sampling/importance_sampling_ratio/min": 9.963324609785218e-10,
"sampling/sampling_logp_difference/max": 3.4498603343963623,
"sampling/sampling_logp_difference/mean": 1.676363468170166,
"step": 15,
"step_time": 5.806083219005814
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.69223439693451,
"epoch": 0.00016,
"grad_norm": 0.020264672115445137,
"kl": 0.005897294729948044,
"learning_rate": 4.2857142857142855e-06,
"loss": -0.0008,
"step": 16,
"step_time": 3.649606159000541
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 8.0,
"completions/max_terminated_length": 8.0,
"completions/mean_length": 2.1875,
"completions/mean_terminated_length": 2.1875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.732311606407166,
"epoch": 0.00017,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.019782064482569695,
"kl": 0.007078325026668608,
"learning_rate": 4.571428571428572e-06,
"loss": -0.0,
"num_tokens": 317749.0,
"reward": -0.5659611821174622,
"reward_std": 0.7136144042015076,
"rewards/rollout_reward_func/mean": -0.5659611821174622,
"rewards/rollout_reward_func/std": 0.7692865133285522,
"sampling/importance_sampling_ratio/max": 0.08927696198225021,
"sampling/importance_sampling_ratio/mean": 0.034128978848457336,
"sampling/importance_sampling_ratio/min": 6.115115684224293e-05,
"sampling/sampling_logp_difference/max": 2.444645404815674,
"sampling/sampling_logp_difference/mean": 1.729607105255127,
"step": 17,
"step_time": 6.096930697000062
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.71469098329544,
"epoch": 0.00018,
"grad_norm": 0.0198439322412014,
"kl": 0.00980698294006288,
"learning_rate": 4.857142857142858e-06,
"loss": -0.0001,
"step": 18,
"step_time": 2.8486052290027146
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13.0,
"completions/max_terminated_length": 13.0,
"completions/mean_length": 2.34375,
"completions/mean_terminated_length": 2.34375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.700989127159119,
"epoch": 0.00019,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.02092764899134636,
"kl": 0.016879421891644597,
"learning_rate": 5.142857142857142e-06,
"loss": -0.0005,
"num_tokens": 353593.0,
"reward": -0.5766444802284241,
"reward_std": 0.8734984397888184,
"rewards/rollout_reward_func/mean": -0.5766444802284241,
"rewards/rollout_reward_func/std": 0.8666929602622986,
"sampling/importance_sampling_ratio/max": 0.10328938066959381,
"sampling/importance_sampling_ratio/mean": 0.0412919819355011,
"sampling/importance_sampling_ratio/min": 8.264829792770101e-11,
"sampling/sampling_logp_difference/max": 3.909327507019043,
"sampling/sampling_logp_difference/mean": 1.7047920227050781,
"step": 19,
"step_time": 5.767663798993453
},
{
"clip_ratio/high_max": 0.03125,
"clip_ratio/high_mean": 0.015625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"entropy": 8.635276675224304,
"epoch": 0.0002,
"grad_norm": 0.02117123454809189,
"kl": 0.022729096352122724,
"learning_rate": 5.428571428571429e-06,
"loss": -0.0005,
"step": 20,
"step_time": 2.8989755920047173
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 7.0,
"completions/max_terminated_length": 7.0,
"completions/mean_length": 2.15625,
"completions/mean_terminated_length": 2.15625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.578810691833496,
"epoch": 0.00021,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.044351667165756226,
"kl": 0.03346684481948614,
"learning_rate": 5.7142857142857145e-06,
"loss": -0.0024,
"num_tokens": 388691.0,
"reward": -0.6427146196365356,
"reward_std": 0.8122553825378418,
"rewards/rollout_reward_func/mean": -0.6427146196365356,
"rewards/rollout_reward_func/std": 0.7960423827171326,
"sampling/importance_sampling_ratio/max": 0.10920954495668411,
"sampling/importance_sampling_ratio/mean": 0.04724588990211487,
"sampling/importance_sampling_ratio/min": 2.8349152216833318e-06,
"sampling/sampling_logp_difference/max": 3.772367477416992,
"sampling/sampling_logp_difference/mean": 1.6777459383010864,
"step": 21,
"step_time": 5.727157995002926
},
{
"clip_ratio/high_max": 0.03125,
"clip_ratio/high_mean": 0.015625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"entropy": 8.421014785766602,
"epoch": 0.00022,
"grad_norm": 0.044636089354753494,
"kl": 0.047105960082262754,
"learning_rate": 6e-06,
"loss": -0.0026,
"step": 22,
"step_time": 4.04690631700214
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2.0,
"completions/max_terminated_length": 2.0,
"completions/mean_length": 2.0,
"completions/mean_terminated_length": 2.0,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.332530975341797,
"epoch": 0.00023,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0429239459335804,
"kl": 0.07239408232271671,
"learning_rate": 6.285714285714286e-06,
"loss": -0.0028,
"num_tokens": 424016.0,
"reward": -0.6825613975524902,
"reward_std": 0.8769230246543884,
"rewards/rollout_reward_func/mean": -0.6825613975524902,
"rewards/rollout_reward_func/std": 0.852479875087738,
"sampling/importance_sampling_ratio/max": 0.14365191757678986,
"sampling/importance_sampling_ratio/mean": 0.05794315040111542,
"sampling/importance_sampling_ratio/min": 0.008735693991184235,
"sampling/sampling_logp_difference/max": 2.5880439281463623,
"sampling/sampling_logp_difference/mean": 1.6414165496826172,
"step": 23,
"step_time": 5.61804673000006
},
{
"clip_ratio/high_max": 0.21875,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.109375,
"entropy": 8.144118428230286,
"epoch": 0.00024,
"grad_norm": 0.0180932879447937,
"kl": 0.0962864700704813,
"learning_rate": 6.571428571428572e-06,
"loss": -0.0031,
"step": 24,
"step_time": 2.8988405260024592
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5.0,
"completions/max_terminated_length": 5.0,
"completions/mean_length": 2.1875,
"completions/mean_terminated_length": 2.1875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.125683069229126,
"epoch": 0.00025,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.03623996675014496,
"kl": 0.10392077919095755,
"learning_rate": 6.857142857142858e-06,
"loss": -0.0041,
"num_tokens": 459589.0,
"reward": -0.61258465051651,
"reward_std": 0.871542751789093,
"rewards/rollout_reward_func/mean": -0.61258465051651,
"rewards/rollout_reward_func/std": 0.8524011969566345,
"sampling/importance_sampling_ratio/max": 0.16312259435653687,
"sampling/importance_sampling_ratio/mean": 0.06305442750453949,
"sampling/importance_sampling_ratio/min": 1.7614916032471228e-06,
"sampling/sampling_logp_difference/max": 4.772340774536133,
"sampling/sampling_logp_difference/mean": 1.7246109247207642,
"step": 25,
"step_time": 5.539267299005587
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 7.976094305515289,
"epoch": 0.00026,
"grad_norm": 0.030301710590720177,
"kl": 0.13206800539046526,
"learning_rate": 7.1428571428571436e-06,
"loss": -0.0045,
"step": 26,
"step_time": 2.896310984997399
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16.0,
"completions/max_terminated_length": 2.0,
"completions/mean_length": 2.4375,
"completions/mean_terminated_length": 2.0,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 7.269331395626068,
"epoch": 0.00027,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.022369084879755974,
"kl": 0.16119840927422047,
"learning_rate": 7.428571428571429e-06,
"loss": -0.0036,
"num_tokens": 496650.0,
"reward": -0.7243883013725281,
"reward_std": 0.7688334584236145,
"rewards/rollout_reward_func/mean": -0.7243883013725281,
"rewards/rollout_reward_func/std": 0.7527879476547241,
"sampling/importance_sampling_ratio/max": 0.18785437941551208,
"sampling/importance_sampling_ratio/mean": 0.10117587447166443,
"sampling/importance_sampling_ratio/min": 8.512477528421769e-11,
"sampling/sampling_logp_difference/max": 4.909823417663574,
"sampling/sampling_logp_difference/mean": 1.4340462684631348,
"step": 27,
"step_time": 6.4576256859945715
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 7.120794892311096,
"epoch": 0.00028,
"grad_norm": 0.02468658983707428,
"kl": 0.182576522231102,
"learning_rate": 7.714285714285716e-06,
"loss": -0.0038,
"step": 28,
"step_time": 3.5662226489985187
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2.0,
"completions/max_terminated_length": 2.0,
"completions/mean_length": 2.0,
"completions/mean_terminated_length": 2.0,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 7.656641006469727,
"epoch": 0.00029,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.016346270218491554,
"kl": 0.1884312927722931,
"learning_rate": 8.000000000000001e-06,
"loss": -0.0025,
"num_tokens": 531329.0,
"reward": -0.6714671850204468,
"reward_std": 0.8514942526817322,
"rewards/rollout_reward_func/mean": -0.6714671850204468,
"rewards/rollout_reward_func/std": 0.8725821375846863,
"sampling/importance_sampling_ratio/max": 0.2034609168767929,
"sampling/importance_sampling_ratio/mean": 0.0898696631193161,
"sampling/importance_sampling_ratio/min": 0.008383152075111866,
"sampling/sampling_logp_difference/max": 2.7939882278442383,
"sampling/sampling_logp_difference/mean": 1.5407953262329102,
"step": 29,
"step_time": 5.637962408003659
},
{
"clip_ratio/high_max": 0.0625,
"clip_ratio/high_mean": 0.046875,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.046875,
"entropy": 7.5971901416778564,
"epoch": 0.0003,
"grad_norm": 0.013556623831391335,
"kl": 0.20893656089901924,
"learning_rate": 8.285714285714287e-06,
"loss": -0.0026,
"step": 30,
"step_time": 2.9192343930008064
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9.0,
"completions/max_terminated_length": 9.0,
"completions/mean_length": 2.21875,
"completions/mean_terminated_length": 2.21875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.758796453475952,
"epoch": 0.00031,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.019501112401485443,
"kl": 0.3079346362501383,
"learning_rate": 8.571428571428571e-06,
"loss": -0.0031,
"num_tokens": 567805.0,
"reward": -0.568469762802124,
"reward_std": 0.8567708730697632,
"rewards/rollout_reward_func/mean": -0.568469762802124,
"rewards/rollout_reward_func/std": 0.8660122752189636,
"sampling/importance_sampling_ratio/max": 0.22180257737636566,
"sampling/importance_sampling_ratio/mean": 0.12508273124694824,
"sampling/importance_sampling_ratio/min": 3.750224089604792e-11,
"sampling/sampling_logp_difference/max": 5.136954307556152,
"sampling/sampling_logp_difference/mean": 1.4436562061309814,
"step": 31,
"step_time": 5.43906901200171
},
{
"clip_ratio/high_max": 0.0625,
"clip_ratio/high_mean": 0.03125,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0625,
"entropy": 6.6603924036026,
"epoch": 0.00032,
"grad_norm": 0.01703478768467903,
"kl": 0.3732527755200863,
"learning_rate": 8.857142857142858e-06,
"loss": -0.0032,
"step": 32,
"step_time": 2.930219074998604
},
{
"clip_ratio/high_max": 0.03125,
"clip_ratio/high_mean": 0.015625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2.0,
"completions/max_terminated_length": 2.0,
"completions/mean_length": 2.0,
"completions/mean_terminated_length": 2.0,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.725336015224457,
"epoch": 0.00033,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.016738811507821083,
"kl": 0.27567504718899727,
"learning_rate": 9.142857142857144e-06,
"loss": -0.0019,
"num_tokens": 603268.0,
"reward": -0.30585941672325134,
"reward_std": 0.6699719429016113,
"rewards/rollout_reward_func/mean": -0.30585941672325134,
"rewards/rollout_reward_func/std": 0.6897762417793274,
"sampling/importance_sampling_ratio/max": 0.2360040694475174,
"sampling/importance_sampling_ratio/mean": 0.1374823898077011,
"sampling/importance_sampling_ratio/min": 0.006810983642935753,
"sampling/sampling_logp_difference/max": 3.1095614433288574,
"sampling/sampling_logp_difference/mean": 1.2982618808746338,
"step": 33,
"step_time": 6.234361916005582
},
{
"clip_ratio/high_max": 0.09375,
"clip_ratio/high_mean": 0.046875,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.046875,
"entropy": 6.653369903564453,
"epoch": 0.00034,
"grad_norm": 0.01500980369746685,
"kl": 0.29385758377611637,
"learning_rate": 9.42857142857143e-06,
"loss": -0.0019,
"step": 34,
"step_time": 2.948668930999702
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5.0,
"completions/max_terminated_length": 5.0,
"completions/mean_length": 2.09375,
"completions/mean_terminated_length": 2.09375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.599472224712372,
"epoch": 0.00035,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.018205825239419937,
"kl": 0.35661908239126205,
"learning_rate": 9.714285714285715e-06,
"loss": -0.0049,
"num_tokens": 639208.0,
"reward": -0.5350777506828308,
"reward_std": 0.7106601595878601,
"rewards/rollout_reward_func/mean": -0.5350777506828308,
"rewards/rollout_reward_func/std": 0.7991757392883301,
"sampling/importance_sampling_ratio/max": 0.25684407353401184,
"sampling/importance_sampling_ratio/mean": 0.1485980749130249,
"sampling/importance_sampling_ratio/min": 3.692734389915131e-05,
"sampling/sampling_logp_difference/max": 4.381838321685791,
"sampling/sampling_logp_difference/mean": 1.253082275390625,
"step": 35,
"step_time": 5.500268681997113
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 6.5274258852005005,
"epoch": 0.00036,
"grad_norm": 0.026637688279151917,
"kl": 0.36279567517340183,
"learning_rate": 1e-05,
"loss": -0.0049,
"step": 36,
"step_time": 2.9163373929950467
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2.0,
"completions/max_terminated_length": 2.0,
"completions/mean_length": 2.0,
"completions/mean_terminated_length": 2.0,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.327381074428558,
"epoch": 0.00037,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.03791587054729462,
"kl": 0.4374086819589138,
"learning_rate": 9.999999999962232e-06,
"loss": -0.0034,
"num_tokens": 675224.0,
"reward": -0.42839378118515015,
"reward_std": 0.7165933847427368,
"rewards/rollout_reward_func/mean": -0.42839378118515015,
"rewards/rollout_reward_func/std": 0.6934623122215271,
"sampling/importance_sampling_ratio/max": 0.2750149071216583,
"sampling/importance_sampling_ratio/mean": 0.16812871396541595,
"sampling/importance_sampling_ratio/min": 0.005278678145259619,
"sampling/sampling_logp_difference/max": 3.2639646530151367,
"sampling/sampling_logp_difference/mean": 1.122904896736145,
"step": 37,
"step_time": 5.68354233199716
},
{
"clip_ratio/high_max": 0.09375,
"clip_ratio/high_mean": 0.046875,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.046875,
"entropy": 6.15696656703949,
"epoch": 0.00038,
"grad_norm": 0.01739896647632122,
"kl": 0.46510135009884834,
"learning_rate": 9.999999999848919e-06,
"loss": -0.0035,
"step": 38,
"step_time": 2.9220271470039734
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16.0,
"completions/max_terminated_length": 8.0,
"completions/mean_length": 2.625,
"completions/mean_terminated_length": 2.1935482025146484,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.091022729873657,
"epoch": 0.00039,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0197161715477705,
"kl": 0.42657361552119255,
"learning_rate": 9.99999999966007e-06,
"loss": -0.0024,
"num_tokens": 710988.0,
"reward": -0.3023349940776825,
"reward_std": 0.6465471386909485,
"rewards/rollout_reward_func/mean": -0.3023349940776825,
"rewards/rollout_reward_func/std": 0.6331813335418701,
"sampling/importance_sampling_ratio/max": 0.2962448298931122,
"sampling/importance_sampling_ratio/mean": 0.18444794416427612,
"sampling/importance_sampling_ratio/min": 4.504835306867738e-12,
"sampling/sampling_logp_difference/max": 4.963308334350586,
"sampling/sampling_logp_difference/mean": 1.1856834888458252,
"step": 39,
"step_time": 7.029601453006762
},
{
"clip_ratio/high_max": 0.1319444444961846,
"clip_ratio/high_mean": 0.07847222150303423,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.07847222150303423,
"entropy": 5.928341567516327,
"epoch": 0.0004,
"grad_norm": 0.028808562085032463,
"kl": 0.44897962361574173,
"learning_rate": 9.99999999939568e-06,
"loss": -0.0025,
"step": 40,
"step_time": 2.9406937890053086
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2.0,
"completions/max_terminated_length": 2.0,
"completions/mean_length": 2.0,
"completions/mean_terminated_length": 2.0,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.25755649805069,
"epoch": 0.00041,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0933663472533226,
"kl": 0.48784746043384075,
"learning_rate": 9.999999999055747e-06,
"loss": 0.0029,
"num_tokens": 745636.0,
"reward": -0.19651329517364502,
"reward_std": 0.5318358540534973,
"rewards/rollout_reward_func/mean": -0.19651329517364502,
"rewards/rollout_reward_func/std": 0.5945489406585693,
"sampling/importance_sampling_ratio/max": 0.31440603733062744,
"sampling/importance_sampling_ratio/mean": 0.18640094995498657,
"sampling/importance_sampling_ratio/min": 0.011243580840528011,
"sampling/sampling_logp_difference/max": 2.6481189727783203,
"sampling/sampling_logp_difference/mean": 1.0152370929718018,
"step": 41,
"step_time": 5.63657486000011
},
{
"clip_ratio/high_max": 0.1875,
"clip_ratio/high_mean": 0.09375,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.09375,
"entropy": 5.96447890996933,
"epoch": 0.00042,
"grad_norm": 0.02206423319876194,
"kl": 0.5289704687893391,
"learning_rate": 9.999999998640277e-06,
"loss": 0.0027,
"step": 42,
"step_time": 2.8975234469944553
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2.0,
"completions/max_terminated_length": 2.0,
"completions/mean_length": 2.0,
"completions/mean_terminated_length": 2.0,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.439069867134094,
"epoch": 0.00043,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.11297624558210373,
"kl": 0.4683471880853176,
"learning_rate": 9.999999998149264e-06,
"loss": 0.0006,
"num_tokens": 781581.0,
"reward": -0.44622302055358887,
"reward_std": 0.68892902135849,
"rewards/rollout_reward_func/mean": -0.44622302055358887,
"rewards/rollout_reward_func/std": 0.7478122711181641,
"sampling/importance_sampling_ratio/max": 0.3280465304851532,
"sampling/importance_sampling_ratio/mean": 0.22684511542320251,
"sampling/importance_sampling_ratio/min": 0.026074819266796112,
"sampling/sampling_logp_difference/max": 2.0682249069213867,
"sampling/sampling_logp_difference/mean": 0.8481977581977844,
"step": 43,
"step_time": 5.7263973810077005
},
{
"clip_ratio/high_max": 0.15625,
"clip_ratio/high_mean": 0.09375,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.09375,
"entropy": 5.03247994184494,
"epoch": 0.00044,
"grad_norm": 0.06387817859649658,
"kl": 0.5371211282908916,
"learning_rate": 9.999999997582713e-06,
"loss": 0.0004,
"step": 44,
"step_time": 3.3802060630041524
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5.0,
"completions/max_terminated_length": 5.0,
"completions/mean_length": 2.09375,
"completions/mean_terminated_length": 2.09375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 4.975203037261963,
"epoch": 0.00045,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13102945685386658,
"kl": 0.6513971909880638,
"learning_rate": 9.999999996940621e-06,
"loss": -0.0028,
"num_tokens": 817273.0,
"reward": -0.587563157081604,
"reward_std": 0.7007678747177124,
"rewards/rollout_reward_func/mean": -0.587563157081604,
"rewards/rollout_reward_func/std": 0.7760494947433472,
"sampling/importance_sampling_ratio/max": 0.3387902081012726,
"sampling/importance_sampling_ratio/mean": 0.2464321404695511,
"sampling/importance_sampling_ratio/min": 7.80636619310826e-05,
"sampling/sampling_logp_difference/max": 4.357866287231445,
"sampling/sampling_logp_difference/mean": 0.8462474346160889,
"step": 45,
"step_time": 6.2393272499975865
},
{
"clip_ratio/high_max": 0.0625,
"clip_ratio/high_mean": 0.03125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.03125,
"clip_ratio/region_mean": 0.09375,
"entropy": 4.856449127197266,
"epoch": 0.00046,
"grad_norm": 0.0859452411532402,
"kl": 0.6537227220833302,
"learning_rate": 9.99999999622299e-06,
"loss": -0.0031,
"step": 46,
"step_time": 2.901350881998951
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2.0,
"completions/max_terminated_length": 2.0,
"completions/mean_length": 2.0,
"completions/mean_terminated_length": 2.0,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 4.492773771286011,
"epoch": 0.00047,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14956164360046387,
"kl": 0.5590145848691463,
"learning_rate": 9.999999995429816e-06,
"loss": -0.0016,
"num_tokens": 853016.0,
"reward": -0.2771303355693817,
"reward_std": 0.7537246942520142,
"rewards/rollout_reward_func/mean": -0.2771303355693817,
"rewards/rollout_reward_func/std": 0.7401061654090881,
"sampling/importance_sampling_ratio/max": 0.34557926654815674,
"sampling/importance_sampling_ratio/mean": 0.2762402594089508,
"sampling/importance_sampling_ratio/min": 0.03973078727722168,
"sampling/sampling_logp_difference/max": 1.9350109100341797,
"sampling/sampling_logp_difference/mean": 0.7055625915527344,
"step": 47,
"step_time": 5.625663070004521
},
{
"clip_ratio/high_max": 0.09375,
"clip_ratio/high_mean": 0.046875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.078125,
"entropy": 4.3877677619457245,
"epoch": 0.00048,
"grad_norm": 0.06713134795427322,
"kl": 0.578897014260292,
"learning_rate": 9.999999994561102e-06,
"loss": -0.0019,
"step": 48,
"step_time": 2.886007981996954
},
{
"clip_ratio/high_max": 0.013888888992369175,
"clip_ratio/high_mean": 0.0069444444961845875,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0069444444961845875,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 16.0,
"completions/max_terminated_length": 5.0,
"completions/mean_length": 2.96875,
"completions/mean_terminated_length": 2.1000001430511475,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 4.886862337589264,
"epoch": 0.00049,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3026506304740906,
"kl": 0.6998865567147732,
"learning_rate": 9.99999999361685e-06,
"loss": -0.0043,
"num_tokens": 888637.0,
"reward": -0.3646780252456665,
"reward_std": 0.7392382025718689,
"rewards/rollout_reward_func/mean": -0.3646780252456665,
"rewards/rollout_reward_func/std": 0.7437232136726379,
"sampling/importance_sampling_ratio/max": 0.5906126499176025,
"sampling/importance_sampling_ratio/mean": 0.2536194622516632,
"sampling/importance_sampling_ratio/min": 5.002554794020231e-12,
"sampling/sampling_logp_difference/max": 5.405303001403809,
"sampling/sampling_logp_difference/mean": 1.139528751373291,
"step": 49,
"step_time": 5.81962778799425
},
{
"clip_ratio/high_max": 0.045138888992369175,
"clip_ratio/high_mean": 0.03819444449618459,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.03125,
"clip_ratio/region_mean": 0.13194444426335394,
"entropy": 4.753438889980316,
"epoch": 0.0005,
"grad_norm": 0.22047115862369537,
"kl": 0.8953660875558853,
"learning_rate": 9.999999992597058e-06,
"loss": -0.0044,
"step": 50,
"step_time": 3.4302545330028806
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16.0,
"completions/max_terminated_length": 11.0,
"completions/mean_length": 2.71875,
"completions/mean_terminated_length": 2.2903225421905518,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 4.553148508071899,
"epoch": 0.00051,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14803296327590942,
"kl": 0.8594339191913605,
"learning_rate": 9.999999991501723e-06,
"loss": 0.0001,
"num_tokens": 925783.0,
"reward": -0.47938936948776245,
"reward_std": 0.6224657893180847,
"rewards/rollout_reward_func/mean": -0.47938936948776245,
"rewards/rollout_reward_func/std": 0.6325410604476929,
"sampling/importance_sampling_ratio/max": 0.8112522959709167,
"sampling/importance_sampling_ratio/mean": 0.29810550808906555,
"sampling/importance_sampling_ratio/min": 3.594766628464696e-13,
"sampling/sampling_logp_difference/max": 5.109455108642578,
"sampling/sampling_logp_difference/mean": 1.0870777368545532,
"step": 51,
"step_time": 6.738885986007517
},
{
"clip_ratio/high_max": 0.03125,
"clip_ratio/high_mean": 0.02524038404226303,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02524038404226303,
"entropy": 4.535911321640015,
"epoch": 0.00052,
"grad_norm": 0.04543463885784149,
"kl": 0.7985228635370731,
"learning_rate": 9.99999999033085e-06,
"loss": -0.0004,
"step": 52,
"step_time": 3.152213782999752
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2.0,
"completions/max_terminated_length": 2.0,
"completions/mean_length": 2.0,
"completions/mean_terminated_length": 2.0,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 4.083370506763458,
"epoch": 0.00053,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.04525892809033394,
"kl": 0.874318428337574,
"learning_rate": 9.999999989084436e-06,
"loss": -0.0025,
"num_tokens": 961105.0,
"reward": -0.21413108706474304,
"reward_std": 0.5813945531845093,
"rewards/rollout_reward_func/mean": -0.21413108706474304,
"rewards/rollout_reward_func/std": 0.5861169099807739,
"sampling/importance_sampling_ratio/max": 0.7165222764015198,
"sampling/importance_sampling_ratio/mean": 0.3133776783943176,
"sampling/importance_sampling_ratio/min": 0.013980884104967117,
"sampling/sampling_logp_difference/max": 3.070335626602173,
"sampling/sampling_logp_difference/mean": 0.7299262285232544,
"step": 53,
"step_time": 6.067928731994471
},
{
"clip_ratio/high_max": 0.03125,
"clip_ratio/high_mean": 0.015625,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03125,
"entropy": 4.059947609901428,
"epoch": 0.00054,
"grad_norm": 0.07141973823308945,
"kl": 0.9976279065012932,
"learning_rate": 9.99999998776248e-06,
"loss": -0.0025,
"step": 54,
"step_time": 3.118995607001125
},
{
"clip_ratio/high_max": 0.03125,
"clip_ratio/high_mean": 0.015625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5.0,
"completions/max_terminated_length": 5.0,
"completions/mean_length": 2.09375,
"completions/mean_terminated_length": 2.09375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 4.261334180831909,
"epoch": 0.00055,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.06153355911374092,
"kl": 0.8010737895965576,
"learning_rate": 9.999999986364988e-06,
"loss": 0.0033,
"num_tokens": 996705.0,
"reward": -0.33633241057395935,
"reward_std": 0.4821242392063141,
"rewards/rollout_reward_func/mean": -0.33633241057395935,
"rewards/rollout_reward_func/std": 0.5220240354537964,
"sampling/importance_sampling_ratio/max": 0.9735277891159058,
"sampling/importance_sampling_ratio/mean": 0.2968878149986267,
"sampling/importance_sampling_ratio/min": 0.0002361015067435801,
"sampling/sampling_logp_difference/max": 4.416370868682861,
"sampling/sampling_logp_difference/mean": 0.8390293717384338,
"step": 55,
"step_time": 5.6938710109971
}
],
"logging_steps": 1.0,
"max_steps": 700000,
"num_input_tokens_seen": 996705,
"num_train_epochs": 7,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}