test_leduc_poker / trainer_state.json
Gege24's picture
Upload task output 1
8645c0c verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.00068,
"eval_steps": 500,
"global_step": 68,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 309.0,
"completions/max_terminated_length": 309.0,
"completions/mean_length": 160.90625,
"completions/mean_terminated_length": 160.90625,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"entropy": 8.38103711605072,
"epoch": 1e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0028616045601665974,
"kl": 0.0,
"learning_rate": 0.0,
"loss": -0.0002,
"num_tokens": 18085.0,
"reward": 0.03125002980232239,
"reward_std": 0.6862481236457825,
"rewards/rollout_reward_func/mean": 0.03125002980232239,
"rewards/rollout_reward_func/std": 1.011366844177246,
"sampling/importance_sampling_ratio/max": 0.010557955130934715,
"sampling/importance_sampling_ratio/mean": 0.003285687882453203,
"sampling/importance_sampling_ratio/min": 7.365059625542847e-13,
"sampling/sampling_logp_difference/max": 10.706700325012207,
"sampling/sampling_logp_difference/mean": 1.4828054904937744,
"step": 1,
"step_time": 5.417499241000769
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.38103711605072,
"epoch": 2e-05,
"grad_norm": 0.002821348374709487,
"kl": 0.0,
"learning_rate": 2.8571428571428575e-07,
"loss": -0.0002,
"step": 2,
"step_time": 2.1234856260016386
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 297.0,
"completions/max_terminated_length": 297.0,
"completions/mean_length": 80.875,
"completions/mean_terminated_length": 80.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 7.941936016082764,
"epoch": 3e-05,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.041607797145843506,
"kl": 0.0004363941061455989,
"learning_rate": 5.714285714285715e-07,
"loss": 0.0023,
"num_tokens": 33737.0,
"reward": -0.13124999403953552,
"reward_std": 0.7598094344139099,
"rewards/rollout_reward_func/mean": -0.13124999403953552,
"rewards/rollout_reward_func/std": 1.0014303922653198,
"sampling/importance_sampling_ratio/max": 0.11680073291063309,
"sampling/importance_sampling_ratio/mean": 0.033671747893095016,
"sampling/importance_sampling_ratio/min": 1.0370337122367346e-06,
"sampling/sampling_logp_difference/max": 4.6255574226379395,
"sampling/sampling_logp_difference/mean": 1.339035987854004,
"step": 3,
"step_time": 3.9910208220007917
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 7.934386253356934,
"epoch": 4e-05,
"grad_norm": 0.04307129234075546,
"kl": 0.0004773353211930953,
"learning_rate": 8.571428571428572e-07,
"loss": 0.0023,
"step": 4,
"step_time": 2.6708831029991416
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 297.0,
"completions/max_terminated_length": 297.0,
"completions/mean_length": 129.0625,
"completions/mean_terminated_length": 129.0625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.268575668334961,
"epoch": 5e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.03974386304616928,
"kl": 0.0007551452181360219,
"learning_rate": 1.142857142857143e-06,
"loss": 0.0022,
"num_tokens": 51251.0,
"reward": -0.53125,
"reward_std": 0.5484436750411987,
"rewards/rollout_reward_func/mean": -0.53125,
"rewards/rollout_reward_func/std": 0.8789427280426025,
"sampling/importance_sampling_ratio/max": 0.11807744950056076,
"sampling/importance_sampling_ratio/mean": 0.024541109800338745,
"sampling/importance_sampling_ratio/min": 1.4743216955430405e-13,
"sampling/sampling_logp_difference/max": 10.85576057434082,
"sampling/sampling_logp_difference/mean": 1.503469467163086,
"step": 5,
"step_time": 4.814693420999902
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.273618221282959,
"epoch": 6e-05,
"grad_norm": 0.04039419814944267,
"kl": 0.0003924804532289272,
"learning_rate": 1.4285714285714286e-06,
"loss": 0.0022,
"step": 6,
"step_time": 2.0722021359997598
},
{
"clip_ratio/high_max": 0.010416666977107525,
"clip_ratio/high_mean": 0.0052083334885537624,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0052083334885537624,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 369.0,
"completions/max_terminated_length": 369.0,
"completions/mean_length": 159.3125,
"completions/mean_terminated_length": 157.77418518066406,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.400971293449402,
"epoch": 7e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.02198818139731884,
"kl": 0.0007719468412688002,
"learning_rate": 1.7142857142857145e-06,
"loss": 0.0013,
"num_tokens": 69453.0,
"reward": -0.6624999642372131,
"reward_std": 0.48922622203826904,
"rewards/rollout_reward_func/mean": -0.6624999642372131,
"rewards/rollout_reward_func/std": 0.8071255087852478,
"sampling/importance_sampling_ratio/max": 0.10809381306171417,
"sampling/importance_sampling_ratio/mean": 0.011609362438321114,
"sampling/importance_sampling_ratio/min": 1.4364835228825295e-22,
"sampling/sampling_logp_difference/max": 11.3631591796875,
"sampling/sampling_logp_difference/mean": 1.7411997318267822,
"step": 7,
"step_time": 4.814280139999937
},
{
"clip_ratio/high_max": 0.014583333861082792,
"clip_ratio/high_mean": 0.007291666930541396,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007291666930541396,
"entropy": 8.400816917419434,
"epoch": 8e-05,
"grad_norm": 0.021912436932325363,
"kl": 0.000808713368314784,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0013,
"step": 8,
"step_time": 2.084449717998723
},
{
"clip_ratio/high_max": 0.0052083334885537624,
"clip_ratio/high_mean": 0.0026041667442768812,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0026041667442768812,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 394.0,
"completions/max_terminated_length": 394.0,
"completions/mean_length": 160.40625,
"completions/mean_terminated_length": 156.1666717529297,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.220785737037659,
"epoch": 9e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.00841094646602869,
"kl": 0.0007819603888492566,
"learning_rate": 2.285714285714286e-06,
"loss": -0.0006,
"num_tokens": 88018.0,
"reward": -0.6656249761581421,
"reward_std": 0.30112773180007935,
"rewards/rollout_reward_func/mean": -0.6656249761581421,
"rewards/rollout_reward_func/std": 0.8090652823448181,
"sampling/importance_sampling_ratio/max": 0.10453330725431442,
"sampling/importance_sampling_ratio/mean": 0.009016389958560467,
"sampling/importance_sampling_ratio/min": 3.77039369235492e-14,
"sampling/sampling_logp_difference/max": 11.177996635437012,
"sampling/sampling_logp_difference/mean": 1.46906578540802,
"step": 9,
"step_time": 5.767028449001373
},
{
"clip_ratio/high_max": 0.010416666977107525,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 8.221423089504242,
"epoch": 0.0001,
"grad_norm": 0.00875311903655529,
"kl": 0.0009726146636239719,
"learning_rate": 2.571428571428571e-06,
"loss": -0.0006,
"step": 10,
"step_time": 3.8331103400005304
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 476.0,
"completions/max_terminated_length": 476.0,
"completions/mean_length": 146.875,
"completions/mean_terminated_length": 146.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.224214434623718,
"epoch": 0.00011,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.005308263469487429,
"kl": 0.0006370486844389234,
"learning_rate": 2.8571428571428573e-06,
"loss": -0.0005,
"num_tokens": 105118.0,
"reward": -0.6531249284744263,
"reward_std": 0.27538806200027466,
"rewards/rollout_reward_func/mean": -0.6531249284744263,
"rewards/rollout_reward_func/std": 0.7935158014297485,
"sampling/importance_sampling_ratio/max": 0.09377396106719971,
"sampling/importance_sampling_ratio/mean": 0.007022843696177006,
"sampling/importance_sampling_ratio/min": 1.1420680792263728e-27,
"sampling/sampling_logp_difference/max": 4.9811320304870605,
"sampling/sampling_logp_difference/mean": 1.3924936056137085,
"step": 11,
"step_time": 5.839474645000337
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.226899027824402,
"epoch": 0.00012,
"grad_norm": 0.005312995053827763,
"kl": 0.0006140958357718773,
"learning_rate": 3.142857142857143e-06,
"loss": -0.0005,
"step": 12,
"step_time": 2.6753236439999455
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 294.0,
"completions/max_terminated_length": 294.0,
"completions/mean_length": 188.25,
"completions/mean_terminated_length": 188.25,
"completions/min_length": 87.0,
"completions/min_terminated_length": 87.0,
"entropy": 8.331215858459473,
"epoch": 0.00013,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.0016523015219718218,
"kl": 0.0006200866155268159,
"learning_rate": 3.428571428571429e-06,
"loss": -0.0001,
"num_tokens": 124622.0,
"reward": -1.009374976158142,
"reward_std": 0.0265165027230978,
"rewards/rollout_reward_func/mean": -1.009374976158142,
"rewards/rollout_reward_func/std": 0.03901509940624237,
"sampling/importance_sampling_ratio/max": 0.0077222432009875774,
"sampling/importance_sampling_ratio/mean": 0.0027147922664880753,
"sampling/importance_sampling_ratio/min": 3.714021278022894e-14,
"sampling/sampling_logp_difference/max": 11.583379745483398,
"sampling/sampling_logp_difference/mean": 1.4247815608978271,
"step": 13,
"step_time": 4.811962196999048
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.332964897155762,
"epoch": 0.00014,
"grad_norm": 0.0017943360144272447,
"kl": 0.0005818934332637582,
"learning_rate": 3.7142857142857146e-06,
"loss": -0.0001,
"step": 14,
"step_time": 2.0559995119983796
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 314.0,
"completions/max_terminated_length": 314.0,
"completions/mean_length": 134.34375,
"completions/mean_terminated_length": 134.34375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.289089798927307,
"epoch": 0.00015,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.006275709718465805,
"kl": 0.0006801459057896864,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0006,
"num_tokens": 142137.0,
"reward": -1.0187499523162842,
"reward_std": 0.04082316905260086,
"rewards/rollout_reward_func/mean": -1.0187499523162842,
"rewards/rollout_reward_func/std": 0.04709291458129883,
"sampling/importance_sampling_ratio/max": 0.1032625362277031,
"sampling/importance_sampling_ratio/mean": 0.010007976554334164,
"sampling/importance_sampling_ratio/min": 1.6740179376029118e-07,
"sampling/sampling_logp_difference/max": 4.918362617492676,
"sampling/sampling_logp_difference/mean": 1.347560167312622,
"step": 15,
"step_time": 4.26574305899976
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.295971155166626,
"epoch": 0.00016,
"grad_norm": 0.006583907175809145,
"kl": 0.0010946946458716411,
"learning_rate": 4.2857142857142855e-06,
"loss": -0.0006,
"step": 16,
"step_time": 3.0452185289996123
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 342.0,
"completions/max_terminated_length": 342.0,
"completions/mean_length": 176.59375,
"completions/mean_terminated_length": 176.59375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.386258959770203,
"epoch": 0.00017,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.019049938768148422,
"kl": 0.0009286534441343974,
"learning_rate": 4.571428571428572e-06,
"loss": 0.0008,
"num_tokens": 161764.0,
"reward": 0.34687501192092896,
"reward_std": 0.3751429617404938,
"rewards/rollout_reward_func/mean": 0.34687501192092896,
"rewards/rollout_reward_func/std": 0.9510552287101746,
"sampling/importance_sampling_ratio/max": 0.10295701771974564,
"sampling/importance_sampling_ratio/mean": 0.00530514121055603,
"sampling/importance_sampling_ratio/min": 1.1150266265858022e-09,
"sampling/sampling_logp_difference/max": 8.94926643371582,
"sampling/sampling_logp_difference/mean": 1.3967115879058838,
"step": 17,
"step_time": 4.39325438600099
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.376275658607483,
"epoch": 0.00018,
"grad_norm": 0.017463266849517822,
"kl": 0.0013417988302535377,
"learning_rate": 4.857142857142858e-06,
"loss": 0.0007,
"step": 18,
"step_time": 2.0553056610006024
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 405.0,
"completions/max_terminated_length": 405.0,
"completions/mean_length": 163.53125,
"completions/mean_terminated_length": 163.53125,
"completions/min_length": 62.0,
"completions/min_terminated_length": 62.0,
"entropy": 8.383512496948242,
"epoch": 0.00019,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0029027618002146482,
"kl": 0.0011069331085309386,
"learning_rate": 5.142857142857142e-06,
"loss": -0.0001,
"num_tokens": 180341.0,
"reward": -0.840624988079071,
"reward_std": 0.2795426845550537,
"rewards/rollout_reward_func/mean": -0.840624988079071,
"rewards/rollout_reward_func/std": 0.570715069770813,
"sampling/importance_sampling_ratio/max": 0.009223885834217072,
"sampling/importance_sampling_ratio/mean": 0.002876629587262869,
"sampling/importance_sampling_ratio/min": 3.4141774769962514e-21,
"sampling/sampling_logp_difference/max": 11.647392272949219,
"sampling/sampling_logp_difference/mean": 1.74857759475708,
"step": 19,
"step_time": 5.028274022000915
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.370469331741333,
"epoch": 0.0002,
"grad_norm": 0.002040296094492078,
"kl": 0.0010489341875654645,
"learning_rate": 5.428571428571429e-06,
"loss": -0.0001,
"step": 20,
"step_time": 2.1169578549997823
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 373.0,
"completions/max_terminated_length": 373.0,
"completions/mean_length": 155.15625,
"completions/mean_terminated_length": 155.15625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.354791045188904,
"epoch": 0.00021,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.014582234434783459,
"kl": 0.0016839846794027835,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.0006,
"num_tokens": 198770.0,
"reward": 0.46875,
"reward_std": 0.6740255355834961,
"rewards/rollout_reward_func/mean": 0.46875,
"rewards/rollout_reward_func/std": 0.9006941318511963,
"sampling/importance_sampling_ratio/max": 0.08340345323085785,
"sampling/importance_sampling_ratio/mean": 0.006231832783669233,
"sampling/importance_sampling_ratio/min": 2.269645449359814e-08,
"sampling/sampling_logp_difference/max": 2.1503682136535645,
"sampling/sampling_logp_difference/mean": 1.321890115737915,
"step": 21,
"step_time": 4.95798639899931
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.337073802947998,
"epoch": 0.00022,
"grad_norm": 0.014086912386119366,
"kl": 0.0020258916774764657,
"learning_rate": 6e-06,
"loss": 0.0006,
"step": 22,
"step_time": 2.598393530000976
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0020833334419876337,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0020833334419876337,
"completions/clipped_ratio": 0.0,
"completions/max_length": 417.0,
"completions/max_terminated_length": 417.0,
"completions/mean_length": 152.59375,
"completions/mean_terminated_length": 152.59375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.294317483901978,
"epoch": 0.00023,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.019008534029126167,
"kl": 0.004680573721998371,
"learning_rate": 6.285714285714286e-06,
"loss": 0.0008,
"num_tokens": 217253.0,
"reward": -0.53125,
"reward_std": 0.3699263334274292,
"rewards/rollout_reward_func/mean": -0.53125,
"rewards/rollout_reward_func/std": 0.8785756230354309,
"sampling/importance_sampling_ratio/max": 0.09325665980577469,
"sampling/importance_sampling_ratio/mean": 0.009642375633120537,
"sampling/importance_sampling_ratio/min": 1.0861621381728576e-20,
"sampling/sampling_logp_difference/max": 10.062246322631836,
"sampling/sampling_logp_difference/mean": 1.4383571147918701,
"step": 23,
"step_time": 4.488339367000663
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0020833334419876337,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0020833334419876337,
"entropy": 8.280492901802063,
"epoch": 0.00024,
"grad_norm": 0.01694045588374138,
"kl": 0.007116928434697911,
"learning_rate": 6.571428571428572e-06,
"loss": 0.0007,
"step": 24,
"step_time": 2.144616393000433
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 262.0,
"completions/max_terminated_length": 262.0,
"completions/mean_length": 141.1875,
"completions/mean_terminated_length": 141.1875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.207048416137695,
"epoch": 0.00025,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.008826951496303082,
"kl": 0.011807448376202956,
"learning_rate": 6.857142857142858e-06,
"loss": -0.0003,
"num_tokens": 235259.0,
"reward": -0.7625000476837158,
"reward_std": 0.2917833626270294,
"rewards/rollout_reward_func/mean": -0.7625000476837158,
"rewards/rollout_reward_func/std": 0.6776382327079773,
"sampling/importance_sampling_ratio/max": 0.06867893040180206,
"sampling/importance_sampling_ratio/mean": 0.007195095531642437,
"sampling/importance_sampling_ratio/min": 8.830933421045788e-15,
"sampling/sampling_logp_difference/max": 4.469038486480713,
"sampling/sampling_logp_difference/mean": 1.387784719467163,
"step": 25,
"step_time": 5.051108140999531
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.201621413230896,
"epoch": 0.00026,
"grad_norm": 0.009136058390140533,
"kl": 0.014795138500630856,
"learning_rate": 7.1428571428571436e-06,
"loss": -0.0003,
"step": 26,
"step_time": 2.5399822110020978
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 290.0,
"completions/max_terminated_length": 290.0,
"completions/mean_length": 136.9375,
"completions/mean_terminated_length": 136.9375,
"completions/min_length": 87.0,
"completions/min_terminated_length": 87.0,
"entropy": 8.213225603103638,
"epoch": 0.00027,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.002163404831662774,
"kl": 0.0031390516232931986,
"learning_rate": 7.428571428571429e-06,
"loss": -0.0001,
"num_tokens": 252577.0,
"reward": -0.3937499523162842,
"reward_std": 0.2508378326892853,
"rewards/rollout_reward_func/mean": -0.3937499523162842,
"rewards/rollout_reward_func/std": 0.9557761549949646,
"sampling/importance_sampling_ratio/max": 0.010245459154248238,
"sampling/importance_sampling_ratio/mean": 0.004281196743249893,
"sampling/importance_sampling_ratio/min": 9.54090864979662e-07,
"sampling/sampling_logp_difference/max": 3.967635154724121,
"sampling/sampling_logp_difference/mean": 1.3100483417510986,
"step": 27,
"step_time": 6.014637065999523
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.203812718391418,
"epoch": 0.00028,
"grad_norm": 0.002143328543752432,
"kl": 0.003412064048461616,
"learning_rate": 7.714285714285716e-06,
"loss": -0.0001,
"step": 28,
"step_time": 2.5550204580013087
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 402.0,
"completions/max_terminated_length": 402.0,
"completions/mean_length": 146.15625,
"completions/mean_terminated_length": 146.15625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.294090986251831,
"epoch": 0.00029,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.014697319827973843,
"kl": 0.025721593061462045,
"learning_rate": 8.000000000000001e-06,
"loss": -0.0002,
"num_tokens": 270222.0,
"reward": -0.703125,
"reward_std": 0.2893909811973572,
"rewards/rollout_reward_func/mean": -0.703125,
"rewards/rollout_reward_func/std": 0.7459500432014465,
"sampling/importance_sampling_ratio/max": 0.05455424264073372,
"sampling/importance_sampling_ratio/mean": 0.006639046128839254,
"sampling/importance_sampling_ratio/min": 5.018679106327676e-15,
"sampling/sampling_logp_difference/max": 9.214568138122559,
"sampling/sampling_logp_difference/mean": 1.469120979309082,
"step": 29,
"step_time": 5.175338321999334
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.281907558441162,
"epoch": 0.0003,
"grad_norm": 0.01356051117181778,
"kl": 0.02538611611817032,
"learning_rate": 8.285714285714287e-06,
"loss": -0.0002,
"step": 30,
"step_time": 2.570307294000486
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 217.0,
"completions/max_terminated_length": 217.0,
"completions/mean_length": 122.40625,
"completions/mean_terminated_length": 122.40625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 7.991218149662018,
"epoch": 0.00031,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0034507170785218477,
"kl": 0.012121076317271218,
"learning_rate": 8.571428571428571e-06,
"loss": 0.0,
"num_tokens": 287003.0,
"reward": -1.0,
"reward_std": 0.0,
"rewards/rollout_reward_func/mean": -1.0,
"rewards/rollout_reward_func/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.06423835456371307,
"sampling/importance_sampling_ratio/mean": 0.006418607663363218,
"sampling/importance_sampling_ratio/min": 2.267779519726787e-09,
"sampling/sampling_logp_difference/max": 7.289473533630371,
"sampling/sampling_logp_difference/mean": 1.3377501964569092,
"step": 31,
"step_time": 4.991982824997649
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 7.985842347145081,
"epoch": 0.00032,
"grad_norm": 0.0031677871011197567,
"kl": 0.01152854437532369,
"learning_rate": 8.857142857142858e-06,
"loss": 0.0,
"step": 32,
"step_time": 2.551680210001905
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 290.0,
"completions/max_terminated_length": 290.0,
"completions/mean_length": 141.4375,
"completions/mean_terminated_length": 141.4375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.251873970031738,
"epoch": 0.00033,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.011402878910303116,
"kl": 0.015730057610198855,
"learning_rate": 9.142857142857144e-06,
"loss": -0.0004,
"num_tokens": 305185.0,
"reward": -0.3999999761581421,
"reward_std": 0.5431233644485474,
"rewards/rollout_reward_func/mean": -0.3999999761581421,
"rewards/rollout_reward_func/std": 0.9466408491134644,
"sampling/importance_sampling_ratio/max": 0.05344817042350769,
"sampling/importance_sampling_ratio/mean": 0.005628373473882675,
"sampling/importance_sampling_ratio/min": 1.8508519615559533e-23,
"sampling/sampling_logp_difference/max": 13.587614059448242,
"sampling/sampling_logp_difference/mean": 1.4650746583938599,
"step": 33,
"step_time": 6.090412677998756
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.246023058891296,
"epoch": 0.00034,
"grad_norm": 0.008096246048808098,
"kl": 0.014423061889829114,
"learning_rate": 9.42857142857143e-06,
"loss": -0.0004,
"step": 34,
"step_time": 2.5453261089996886
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 282.0,
"completions/max_terminated_length": 282.0,
"completions/mean_length": 132.625,
"completions/mean_terminated_length": 132.625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.162899851799011,
"epoch": 0.00035,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.004175754263997078,
"kl": 0.018283673271071166,
"learning_rate": 9.714285714285715e-06,
"loss": -0.0001,
"num_tokens": 323325.0,
"reward": -0.5718749761581421,
"reward_std": 0.18343815207481384,
"rewards/rollout_reward_func/mean": -0.5718749761581421,
"rewards/rollout_reward_func/std": 0.8301706910133362,
"sampling/importance_sampling_ratio/max": 0.062451351433992386,
"sampling/importance_sampling_ratio/mean": 0.007372735999524593,
"sampling/importance_sampling_ratio/min": 1.2278145562505762e-20,
"sampling/sampling_logp_difference/max": 9.404362678527832,
"sampling/sampling_logp_difference/mean": 1.5296763181686401,
"step": 35,
"step_time": 5.175936893000653
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.150159418582916,
"epoch": 0.00036,
"grad_norm": 0.004381407983601093,
"kl": 0.017532640282297507,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 36,
"step_time": 2.551337270998374
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 350.0,
"completions/max_terminated_length": 350.0,
"completions/mean_length": 193.5625,
"completions/mean_terminated_length": 193.5625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.269986033439636,
"epoch": 0.00037,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.008302879519760609,
"kl": 0.009193298814352602,
"learning_rate": 9.9999999995372e-06,
"loss": 0.0005,
"num_tokens": 342711.0,
"reward": -0.03749999403953552,
"reward_std": 0.7515531778335571,
"rewards/rollout_reward_func/mean": -0.03749999403953552,
"rewards/rollout_reward_func/std": 1.0247737169265747,
"sampling/importance_sampling_ratio/max": 0.06419403851032257,
"sampling/importance_sampling_ratio/mean": 0.003890752326697111,
"sampling/importance_sampling_ratio/min": 1.164774526829504e-10,
"sampling/sampling_logp_difference/max": 3.794468402862549,
"sampling/sampling_logp_difference/mean": 1.2929356098175049,
"step": 37,
"step_time": 4.535253317000752
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.270979046821594,
"epoch": 0.00038,
"grad_norm": 0.00880998931825161,
"kl": 0.007889840024290606,
"learning_rate": 9.999999998148802e-06,
"loss": 0.0005,
"step": 38,
"step_time": 2.492936824000026
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 347.0,
"completions/max_terminated_length": 347.0,
"completions/mean_length": 171.09375,
"completions/mean_terminated_length": 172.93548583984375,
"completions/min_length": 89.0,
"completions/min_terminated_length": 89.0,
"entropy": 8.237572193145752,
"epoch": 0.00039,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.003999819979071617,
"kl": 0.003984437556937337,
"learning_rate": 9.999999995834804e-06,
"loss": -0.0003,
"num_tokens": 361754.0,
"reward": -0.2750000059604645,
"reward_std": 0.29003918170928955,
"rewards/rollout_reward_func/mean": -0.2750000059604645,
"rewards/rollout_reward_func/std": 0.9721940755844116,
"sampling/importance_sampling_ratio/max": 0.00961074884980917,
"sampling/importance_sampling_ratio/mean": 0.0027457564137876034,
"sampling/importance_sampling_ratio/min": 1.3412947045145382e-17,
"sampling/sampling_logp_difference/max": 10.32085132598877,
"sampling/sampling_logp_difference/mean": 1.3388471603393555,
"step": 39,
"step_time": 4.8369908450004
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.241283535957336,
"epoch": 0.0004,
"grad_norm": 0.003973928280174732,
"kl": 0.004160504468018189,
"learning_rate": 9.999999992595207e-06,
"loss": -0.0003,
"step": 40,
"step_time": 2.064312154999243
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 408.0,
"completions/max_terminated_length": 408.0,
"completions/mean_length": 149.6875,
"completions/mean_terminated_length": 150.7096710205078,
"completions/min_length": 87.0,
"completions/min_terminated_length": 87.0,
"entropy": 8.331173419952393,
"epoch": 0.00041,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.0036302730441093445,
"kl": 0.002889991897973232,
"learning_rate": 9.999999988430008e-06,
"loss": -0.0,
"num_tokens": 380000.0,
"reward": -0.40312498807907104,
"reward_std": 0.5359131097793579,
"rewards/rollout_reward_func/mean": -0.40312498807907104,
"rewards/rollout_reward_func/std": 0.9361845850944519,
"sampling/importance_sampling_ratio/max": 0.009129444137215614,
"sampling/importance_sampling_ratio/mean": 0.003908202983438969,
"sampling/importance_sampling_ratio/min": 3.258544557229945e-14,
"sampling/sampling_logp_difference/max": 4.673151016235352,
"sampling/sampling_logp_difference/mean": 1.426315426826477,
"step": 41,
"step_time": 4.397045022999009
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.317348182201385,
"epoch": 0.00042,
"grad_norm": 0.002291926182806492,
"kl": 0.002586090617114678,
"learning_rate": 9.999999983339212e-06,
"loss": -0.0,
"step": 42,
"step_time": 2.1006949159991564
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 408.0,
"completions/max_terminated_length": 408.0,
"completions/mean_length": 168.1875,
"completions/mean_terminated_length": 168.1875,
"completions/min_length": 89.0,
"completions/min_terminated_length": 89.0,
"entropy": 8.245356917381287,
"epoch": 0.00043,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004080729093402624,
"kl": 0.003670690639410168,
"learning_rate": 9.999999977322818e-06,
"loss": -0.0004,
"num_tokens": 398190.0,
"reward": -0.5843750238418579,
"reward_std": 0.22265997529029846,
"rewards/rollout_reward_func/mean": -0.5843750238418579,
"rewards/rollout_reward_func/std": 0.8462857007980347,
"sampling/importance_sampling_ratio/max": 0.010589290410280228,
"sampling/importance_sampling_ratio/mean": 0.003948138561099768,
"sampling/importance_sampling_ratio/min": 1.0654254561925924e-10,
"sampling/sampling_logp_difference/max": 9.166659355163574,
"sampling/sampling_logp_difference/mean": 1.3605971336364746,
"step": 43,
"step_time": 4.460575121997863
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.227819621562958,
"epoch": 0.00044,
"grad_norm": 0.004100060556083918,
"kl": 0.003359506925335154,
"learning_rate": 9.999999970380822e-06,
"loss": -0.0004,
"step": 44,
"step_time": 3.0909940020010254
},
{
"clip_ratio/high_max": 0.01785714365541935,
"clip_ratio/high_mean": 0.008928571827709675,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008928571827709675,
"completions/clipped_ratio": 0.0,
"completions/max_length": 297.0,
"completions/max_terminated_length": 297.0,
"completions/mean_length": 160.40625,
"completions/mean_terminated_length": 160.40625,
"completions/min_length": 87.0,
"completions/min_terminated_length": 87.0,
"entropy": 8.268202066421509,
"epoch": 0.00045,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.0023337905295193195,
"kl": 0.0028031190449837595,
"learning_rate": 9.999999962513228e-06,
"loss": -0.0001,
"num_tokens": 416179.0,
"reward": -0.5843750238418579,
"reward_std": 0.20319493114948273,
"rewards/rollout_reward_func/mean": -0.5843750238418579,
"rewards/rollout_reward_func/std": 0.830510675907135,
"sampling/importance_sampling_ratio/max": 0.010615027509629726,
"sampling/importance_sampling_ratio/mean": 0.003981100395321846,
"sampling/importance_sampling_ratio/min": 1.0911494689562484e-13,
"sampling/sampling_logp_difference/max": 11.518604278564453,
"sampling/sampling_logp_difference/mean": 1.4530669450759888,
"step": 45,
"step_time": 4.283326912999655
},
{
"clip_ratio/high_max": 0.01785714365541935,
"clip_ratio/high_mean": 0.008928571827709675,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008928571827709675,
"entropy": 8.239543914794922,
"epoch": 0.00046,
"grad_norm": 0.0035347214434295893,
"kl": 0.0033496549731353298,
"learning_rate": 9.999999953720035e-06,
"loss": -0.0001,
"step": 46,
"step_time": 2.0328167930001655
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 346.0,
"completions/max_terminated_length": 346.0,
"completions/mean_length": 154.96875,
"completions/mean_terminated_length": 154.96875,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"entropy": 8.14276933670044,
"epoch": 0.00047,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.003259505843743682,
"kl": 0.005610214080661535,
"learning_rate": 9.99999994400124e-06,
"loss": -0.0004,
"num_tokens": 434762.0,
"reward": -1.0218749046325684,
"reward_std": 0.04966200143098831,
"rewards/rollout_reward_func/mean": -1.0218749046325684,
"rewards/rollout_reward_func/std": 0.0490843690931797,
"sampling/importance_sampling_ratio/max": 0.011610783636569977,
"sampling/importance_sampling_ratio/mean": 0.003400737652555108,
"sampling/importance_sampling_ratio/min": 9.674043361674659e-17,
"sampling/sampling_logp_difference/max": 10.514336585998535,
"sampling/sampling_logp_difference/mean": 1.5065617561340332,
"step": 47,
"step_time": 4.387635872000828
},
{
"clip_ratio/high_max": 0.004807692486792803,
"clip_ratio/high_mean": 0.0024038462433964014,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0024038462433964014,
"entropy": 8.125671744346619,
"epoch": 0.00048,
"grad_norm": 0.0028330760542303324,
"kl": 0.005603832833003253,
"learning_rate": 9.999999933356848e-06,
"loss": -0.0004,
"step": 48,
"step_time": 2.031070074998752
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 265.0,
"completions/max_terminated_length": 265.0,
"completions/mean_length": 153.46875,
"completions/mean_terminated_length": 154.61289978027344,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.048729181289673,
"epoch": 0.00049,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.005494742188602686,
"kl": 0.03256297600455582,
"learning_rate": 9.999999921786855e-06,
"loss": -0.0001,
"num_tokens": 453105.0,
"reward": -0.078125,
"reward_std": 0.5431658029556274,
"rewards/rollout_reward_func/mean": -0.078125,
"rewards/rollout_reward_func/std": 1.0044207572937012,
"sampling/importance_sampling_ratio/max": 0.05582950636744499,
"sampling/importance_sampling_ratio/mean": 0.004962150938808918,
"sampling/importance_sampling_ratio/min": 2.203801160657881e-12,
"sampling/sampling_logp_difference/max": 9.083192825317383,
"sampling/sampling_logp_difference/mean": 1.358087182044983,
"step": 49,
"step_time": 4.150341610999931
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.026359498500824,
"epoch": 0.0005,
"grad_norm": 0.005025926977396011,
"kl": 0.02888420899398625,
"learning_rate": 9.999999909291265e-06,
"loss": -0.0001,
"step": 50,
"step_time": 2.9815445680005723
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 290.0,
"completions/max_terminated_length": 290.0,
"completions/mean_length": 143.65625,
"completions/mean_terminated_length": 143.65625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.10213428735733,
"epoch": 0.00051,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.001717552193440497,
"kl": 0.011197627696674317,
"learning_rate": 9.999999895870075e-06,
"loss": -0.0002,
"num_tokens": 470670.0,
"reward": -1.015625,
"reward_std": 0.036278266459703445,
"rewards/rollout_reward_func/mean": -1.015625,
"rewards/rollout_reward_func/std": 0.05148990824818611,
"sampling/importance_sampling_ratio/max": 0.06585416942834854,
"sampling/importance_sampling_ratio/mean": 0.006765150930732489,
"sampling/importance_sampling_ratio/min": 1.496036702519632e-06,
"sampling/sampling_logp_difference/max": 4.3300042152404785,
"sampling/sampling_logp_difference/mean": 1.3094828128814697,
"step": 51,
"step_time": 4.126140448999649
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.08252465724945,
"epoch": 0.00052,
"grad_norm": 0.0015695166075602174,
"kl": 0.0106147377518937,
"learning_rate": 9.999999881523285e-06,
"loss": -0.0002,
"step": 52,
"step_time": 2.0502745620005953
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 314.0,
"completions/max_terminated_length": 314.0,
"completions/mean_length": 173.5,
"completions/mean_terminated_length": 175.29031372070312,
"completions/min_length": 91.0,
"completions/min_terminated_length": 91.0,
"entropy": 7.971049547195435,
"epoch": 0.00053,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.002233212348073721,
"kl": 0.004628648399375379,
"learning_rate": 9.999999866250896e-06,
"loss": -0.0001,
"num_tokens": 489622.0,
"reward": 0.04375000298023224,
"reward_std": 0.6950876712799072,
"rewards/rollout_reward_func/mean": 0.04375000298023224,
"rewards/rollout_reward_func/std": 1.0162986516952515,
"sampling/importance_sampling_ratio/max": 0.014256482943892479,
"sampling/importance_sampling_ratio/mean": 0.0037497361190617085,
"sampling/importance_sampling_ratio/min": 3.703341886623912e-13,
"sampling/sampling_logp_difference/max": 10.441887855529785,
"sampling/sampling_logp_difference/mean": 1.3701958656311035,
"step": 53,
"step_time": 4.342945265999333
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 7.953936517238617,
"epoch": 0.00054,
"grad_norm": 0.0023060261737555265,
"kl": 0.004763354663737118,
"learning_rate": 9.999999850052909e-06,
"loss": -0.0001,
"step": 54,
"step_time": 2.074805829001889
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 469.0,
"completions/max_terminated_length": 469.0,
"completions/mean_length": 133.125,
"completions/mean_terminated_length": 133.61289978027344,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 7.858851313591003,
"epoch": 0.00055,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.010865299962460995,
"kl": 0.05283547495491803,
"learning_rate": 9.99999983292932e-06,
"loss": 0.0005,
"num_tokens": 507130.0,
"reward": -0.08750000596046448,
"reward_std": 0.2218937873840332,
"rewards/rollout_reward_func/mean": -0.08750000596046448,
"rewards/rollout_reward_func/std": 1.0082721710205078,
"sampling/importance_sampling_ratio/max": 0.0794573649764061,
"sampling/importance_sampling_ratio/mean": 0.01186932623386383,
"sampling/importance_sampling_ratio/min": 2.8194972302655857e-18,
"sampling/sampling_logp_difference/max": 11.174360275268555,
"sampling/sampling_logp_difference/mean": 1.4186177253723145,
"step": 55,
"step_time": 4.7503355889994054
},
{
"clip_ratio/high_max": 0.005681818351149559,
"clip_ratio/high_mean": 0.0028409091755747795,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0028409091755747795,
"entropy": 7.84703141450882,
"epoch": 0.00056,
"grad_norm": 0.009876980446279049,
"kl": 0.04835269978502765,
"learning_rate": 9.999999814880132e-06,
"loss": 0.0005,
"step": 56,
"step_time": 2.697479030000977
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 309.0,
"completions/max_terminated_length": 309.0,
"completions/mean_length": 161.03125,
"completions/mean_terminated_length": 161.03125,
"completions/min_length": 87.0,
"completions/min_terminated_length": 87.0,
"entropy": 7.946120798587799,
"epoch": 0.00057,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.00416550925001502,
"kl": 0.005225592787610367,
"learning_rate": 9.999999795905347e-06,
"loss": 0.0003,
"num_tokens": 524811.0,
"reward": -0.328125,
"reward_std": 0.7972557544708252,
"rewards/rollout_reward_func/mean": -0.328125,
"rewards/rollout_reward_func/std": 0.9642762541770935,
"sampling/importance_sampling_ratio/max": 0.011825657449662685,
"sampling/importance_sampling_ratio/mean": 0.004940683953464031,
"sampling/importance_sampling_ratio/min": 7.442185295759504e-14,
"sampling/sampling_logp_difference/max": 10.764989852905273,
"sampling/sampling_logp_difference/mean": 1.3635454177856445,
"step": 57,
"step_time": 4.399483632999363
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 7.954219579696655,
"epoch": 0.00058,
"grad_norm": 0.004001974128186703,
"kl": 0.005270991328870878,
"learning_rate": 9.999999776004962e-06,
"loss": 0.0003,
"step": 58,
"step_time": 2.068970861998423
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 305.0,
"completions/max_terminated_length": 305.0,
"completions/mean_length": 137.78125,
"completions/mean_terminated_length": 137.78125,
"completions/min_length": 100.0,
"completions/min_terminated_length": 100.0,
"entropy": 7.7305819392204285,
"epoch": 0.00059,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.002240613801404834,
"kl": 0.007570188608951867,
"learning_rate": 9.999999755178978e-06,
"loss": -0.0002,
"num_tokens": 542324.0,
"reward": -0.565625011920929,
"reward_std": 0.18561552464962006,
"rewards/rollout_reward_func/mean": -0.565625011920929,
"rewards/rollout_reward_func/std": 0.841890811920166,
"sampling/importance_sampling_ratio/max": 0.014670869335532188,
"sampling/importance_sampling_ratio/mean": 0.006734498776495457,
"sampling/importance_sampling_ratio/min": 9.527041698945138e-13,
"sampling/sampling_logp_difference/max": 10.319395065307617,
"sampling/sampling_logp_difference/mean": 1.346876621246338,
"step": 59,
"step_time": 4.076921171999857
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 7.7493292689323425,
"epoch": 0.0006,
"grad_norm": 0.0021323147229850292,
"kl": 0.007373731641564518,
"learning_rate": 9.999999733427394e-06,
"loss": -0.0002,
"step": 60,
"step_time": 2.065178036998077
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 305.0,
"completions/max_terminated_length": 305.0,
"completions/mean_length": 131.15625,
"completions/mean_terminated_length": 132.53334045410156,
"completions/min_length": 100.0,
"completions/min_terminated_length": 100.0,
"entropy": 7.80867725610733,
"epoch": 0.00061,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.002022077329456806,
"kl": 0.004840011039050296,
"learning_rate": 9.99999971075021e-06,
"loss": -0.0003,
"num_tokens": 559385.0,
"reward": -0.25312501192092896,
"reward_std": 0.2761000692844391,
"rewards/rollout_reward_func/mean": -0.25312501192092896,
"rewards/rollout_reward_func/std": 0.9863533973693848,
"sampling/importance_sampling_ratio/max": 0.011284389533102512,
"sampling/importance_sampling_ratio/mean": 0.006653377786278725,
"sampling/importance_sampling_ratio/min": 2.8632414147966578e-11,
"sampling/sampling_logp_difference/max": 4.169203758239746,
"sampling/sampling_logp_difference/mean": 1.2035651206970215,
"step": 61,
"step_time": 4.5231009249973795
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 7.829549431800842,
"epoch": 0.00062,
"grad_norm": 0.0023180257994681597,
"kl": 0.004549442324787378,
"learning_rate": 9.999999687147426e-06,
"loss": -0.0003,
"step": 62,
"step_time": 2.04733120399942
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 212.0,
"completions/max_terminated_length": 212.0,
"completions/mean_length": 123.1875,
"completions/mean_terminated_length": 123.1875,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"entropy": 8.036745607852936,
"epoch": 0.00063,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.0024225530214607716,
"kl": 0.007575233932584524,
"learning_rate": 9.999999662619046e-06,
"loss": -0.0004,
"num_tokens": 577175.0,
"reward": -0.6437500715255737,
"reward_std": 0.2552982568740845,
"rewards/rollout_reward_func/mean": -0.6437500715255737,
"rewards/rollout_reward_func/std": 0.7873751521110535,
"sampling/importance_sampling_ratio/max": 0.010909978300333023,
"sampling/importance_sampling_ratio/mean": 0.005395432468503714,
"sampling/importance_sampling_ratio/min": 2.1033281019655625e-11,
"sampling/sampling_logp_difference/max": 10.216733932495117,
"sampling/sampling_logp_difference/mean": 1.502685546875,
"step": 63,
"step_time": 3.9012105610008803
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.038543045520782,
"epoch": 0.00064,
"grad_norm": 0.0024051358923316,
"kl": 0.007686805154662579,
"learning_rate": 9.999999637165062e-06,
"loss": -0.0004,
"step": 64,
"step_time": 2.0093538759983858
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 282.0,
"completions/max_terminated_length": 282.0,
"completions/mean_length": 141.65625,
"completions/mean_terminated_length": 141.65625,
"completions/min_length": 62.0,
"completions/min_terminated_length": 62.0,
"entropy": 8.191402852535248,
"epoch": 0.00065,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.003754268866032362,
"kl": 0.005535919044632465,
"learning_rate": 9.999999610785483e-06,
"loss": -0.0,
"num_tokens": 594644.0,
"reward": -0.33125001192092896,
"reward_std": 0.4657542407512665,
"rewards/rollout_reward_func/mean": -0.33125001192092896,
"rewards/rollout_reward_func/std": 0.9663491249084473,
"sampling/importance_sampling_ratio/max": 0.010239495895802975,
"sampling/importance_sampling_ratio/mean": 0.004091148264706135,
"sampling/importance_sampling_ratio/min": 1.6428119986328787e-13,
"sampling/sampling_logp_difference/max": 11.840818405151367,
"sampling/sampling_logp_difference/mean": 1.458545446395874,
"step": 65,
"step_time": 4.098211152998374
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.207817077636719,
"epoch": 0.00066,
"grad_norm": 0.0028874659910798073,
"kl": 0.005369087448343635,
"learning_rate": 9.999999583480304e-06,
"loss": -0.0,
"step": 66,
"step_time": 2.03046784199978
},
{
"clip_ratio/high_max": 0.005434782709926367,
"clip_ratio/high_mean": 0.0027173913549631834,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0027173913549631834,
"completions/clipped_ratio": 0.0,
"completions/max_length": 286.0,
"completions/max_terminated_length": 286.0,
"completions/mean_length": 131.0625,
"completions/mean_terminated_length": 131.0625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.138030529022217,
"epoch": 0.00067,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.008762937970459461,
"kl": 0.03389387877541594,
"learning_rate": 9.999999555249524e-06,
"loss": 0.0004,
"num_tokens": 612302.0,
"reward": -0.32500001788139343,
"reward_std": 0.44983184337615967,
"rewards/rollout_reward_func/mean": -0.32500001788139343,
"rewards/rollout_reward_func/std": 0.9615175724029541,
"sampling/importance_sampling_ratio/max": 0.06631788611412048,
"sampling/importance_sampling_ratio/mean": 0.009149353951215744,
"sampling/importance_sampling_ratio/min": 2.947741995564272e-15,
"sampling/sampling_logp_difference/max": 4.271122455596924,
"sampling/sampling_logp_difference/mean": 1.307502269744873,
"step": 67,
"step_time": 4.921273821997602
},
{
"clip_ratio/high_max": 0.005434782709926367,
"clip_ratio/high_mean": 0.0027173913549631834,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0027173913549631834,
"entropy": 8.132037341594696,
"epoch": 0.00068,
"grad_norm": 0.009227960370481014,
"kl": 0.03418682742631063,
"learning_rate": 9.999999526093148e-06,
"loss": 0.0004,
"step": 68,
"step_time": 2.032772711999314
}
],
"logging_steps": 1.0,
"max_steps": 200000,
"num_input_tokens_seen": 612302,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}