smoke-ld-test / trainer_state.json
Jordansky's picture
Upload task output 100000001
f1b834e verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.00023,
"eval_steps": 500,
"global_step": 23,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1045.0,
"completions/max_terminated_length": 1045.0,
"completions/mean_length": 390.1875,
"completions/mean_terminated_length": 390.1875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.605668731033802,
"epoch": 1e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8689773678779602,
"kl": 0.0,
"learning_rate": 0.0,
"loss": -0.009,
"num_tokens": 49602.0,
"reward": 0.49261724948883057,
"reward_std": 1.4282547235488892,
"rewards/rollout_reward_func/mean": 0.49261724948883057,
"rewards/rollout_reward_func/std": 1.4220702648162842,
"sampling/importance_sampling_ratio/max": 1.4268709421157837,
"sampling/importance_sampling_ratio/mean": 0.8554609417915344,
"sampling/importance_sampling_ratio/min": 0.6006377339363098,
"sampling/sampling_logp_difference/max": 0.6218547821044922,
"sampling/sampling_logp_difference/mean": 0.05964243412017822,
"step": 1,
"step_time": 13.768371919000856
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.605668731033802,
"epoch": 2e-05,
"grad_norm": 0.8696622252464294,
"kl": 0.0,
"learning_rate": 5.333333333333333e-07,
"loss": -0.009,
"step": 2,
"step_time": 6.979965120997804
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1373.0,
"completions/max_terminated_length": 1373.0,
"completions/mean_length": 256.3125,
"completions/mean_terminated_length": 256.3125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.5112766288220882,
"epoch": 3e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.42978498339653015,
"kl": 0.004170067805148392,
"learning_rate": 1.0666666666666667e-06,
"loss": -0.033,
"num_tokens": 91865.0,
"reward": 0.04000457376241684,
"reward_std": 0.8832277655601501,
"rewards/rollout_reward_func/mean": 0.04000457376241684,
"rewards/rollout_reward_func/std": 1.1484198570251465,
"sampling/importance_sampling_ratio/max": 1.1835894584655762,
"sampling/importance_sampling_ratio/mean": 0.8482605218887329,
"sampling/importance_sampling_ratio/min": 0.33848339319229126,
"sampling/sampling_logp_difference/max": 1.006063461303711,
"sampling/sampling_logp_difference/mean": 0.05736350640654564,
"step": 3,
"step_time": 13.413748369999666
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.02083333395421505,
"clip_ratio/low_min": 0.02083333395421505,
"clip_ratio/region_mean": 0.02083333395421505,
"entropy": 0.5041642189025879,
"epoch": 4e-05,
"grad_norm": 0.42657995223999023,
"kl": 0.005512098512326702,
"learning_rate": 1.6e-06,
"loss": -0.0326,
"step": 4,
"step_time": 7.850364476997129
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1745.0,
"completions/max_terminated_length": 1745.0,
"completions/mean_length": 340.96875,
"completions/mean_terminated_length": 340.96875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.59855717420578,
"epoch": 5e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.033186435699463,
"kl": 0.002424745060807254,
"learning_rate": 2.1333333333333334e-06,
"loss": 0.007,
"num_tokens": 138699.0,
"reward": 0.3879123330116272,
"reward_std": 1.4281163215637207,
"rewards/rollout_reward_func/mean": 0.3879123330116272,
"rewards/rollout_reward_func/std": 1.46486234664917,
"sampling/importance_sampling_ratio/max": 1.9882254600524902,
"sampling/importance_sampling_ratio/mean": 0.8772280812263489,
"sampling/importance_sampling_ratio/min": 2.3855912800740953e-09,
"sampling/sampling_logp_difference/max": 18.587005615234375,
"sampling/sampling_logp_difference/mean": 0.14903730154037476,
"step": 5,
"step_time": 15.827661174997047
},
{
"clip_ratio/high_max": 0.012500000186264515,
"clip_ratio/high_mean": 0.0062500000931322575,
"clip_ratio/low_mean": 0.03258547093719244,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0388354710303247,
"entropy": 0.5908495783805847,
"epoch": 6e-05,
"grad_norm": 0.27227118611335754,
"kl": 0.009793178239533518,
"learning_rate": 2.6666666666666664e-06,
"loss": 0.0062,
"step": 6,
"step_time": 8.810438610000347
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1027.0,
"completions/max_terminated_length": 1027.0,
"completions/mean_length": 231.6875,
"completions/mean_terminated_length": 231.6875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.5898670703172684,
"epoch": 7e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3972772359848022,
"kl": 0.002214236554209492,
"learning_rate": 3.2e-06,
"loss": -0.0126,
"num_tokens": 182583.0,
"reward": 0.27162131667137146,
"reward_std": 1.0868947505950928,
"rewards/rollout_reward_func/mean": 0.27162131667137146,
"rewards/rollout_reward_func/std": 1.5912116765975952,
"sampling/importance_sampling_ratio/max": 1.6212953329086304,
"sampling/importance_sampling_ratio/mean": 0.8596766591072083,
"sampling/importance_sampling_ratio/min": 0.3463258147239685,
"sampling/sampling_logp_difference/max": 0.8983482718467712,
"sampling/sampling_logp_difference/mean": 0.06186839938163757,
"step": 7,
"step_time": 12.116631305001647
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.01458333432674408,
"clip_ratio/low_min": 0.008333333767950535,
"clip_ratio/region_mean": 0.01458333432674408,
"entropy": 0.5949340760707855,
"epoch": 8e-05,
"grad_norm": 0.7176365852355957,
"kl": 0.007167384720332848,
"learning_rate": 3.7333333333333333e-06,
"loss": -0.0125,
"step": 8,
"step_time": 7.219443969997883
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 538.0,
"completions/max_terminated_length": 538.0,
"completions/mean_length": 139.40625,
"completions/mean_terminated_length": 139.40625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.560302022844553,
"epoch": 9e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7369521260261536,
"kl": 0.004876748149399646,
"learning_rate": 4.266666666666667e-06,
"loss": -0.0042,
"num_tokens": 222153.0,
"reward": -0.008272513747215271,
"reward_std": 0.5417632460594177,
"rewards/rollout_reward_func/mean": -0.008272513747215271,
"rewards/rollout_reward_func/std": 0.9568253755569458,
"sampling/importance_sampling_ratio/max": 1.1025569438934326,
"sampling/importance_sampling_ratio/mean": 0.843124270439148,
"sampling/importance_sampling_ratio/min": 0.3177212178707123,
"sampling/sampling_logp_difference/max": 0.4518265426158905,
"sampling/sampling_logp_difference/mean": 0.06099293380975723,
"step": 9,
"step_time": 9.450052213000163
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 0.5663095861673355,
"epoch": 0.0001,
"grad_norm": 0.7801238298416138,
"kl": 0.006132202317530755,
"learning_rate": 4.8e-06,
"loss": -0.0044,
"step": 10,
"step_time": 5.177882021996993
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1454.0,
"completions/max_terminated_length": 1454.0,
"completions/mean_length": 456.15625,
"completions/mean_terminated_length": 456.15625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.7295192927122116,
"epoch": 0.00011,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7713008522987366,
"kl": 0.002301187181842579,
"learning_rate": 5.333333333333333e-06,
"loss": -0.0233,
"num_tokens": 276525.0,
"reward": -0.06042708456516266,
"reward_std": 0.5548678636550903,
"rewards/rollout_reward_func/mean": -0.06042708456516266,
"rewards/rollout_reward_func/std": 0.6719298362731934,
"sampling/importance_sampling_ratio/max": 1.2587913274765015,
"sampling/importance_sampling_ratio/mean": 0.7997827529907227,
"sampling/importance_sampling_ratio/min": 0.49780380725860596,
"sampling/sampling_logp_difference/max": 0.3962627649307251,
"sampling/sampling_logp_difference/mean": 0.06726472079753876,
"step": 11,
"step_time": 14.024354443001357
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.005681818351149559,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005681818351149559,
"entropy": 0.724354475736618,
"epoch": 0.00012,
"grad_norm": 0.5204576849937439,
"kl": 0.004727993551568943,
"learning_rate": 5.866666666666666e-06,
"loss": -0.0237,
"step": 12,
"step_time": 8.684970894000799
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1063.0,
"completions/max_terminated_length": 1063.0,
"completions/mean_length": 458.78125,
"completions/mean_terminated_length": 458.78125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.6593378074467182,
"epoch": 0.00013,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2321019172668457,
"kl": 0.026883443380938843,
"learning_rate": 6.4e-06,
"loss": -0.0337,
"num_tokens": 329014.0,
"reward": 0.5792493224143982,
"reward_std": 1.281329870223999,
"rewards/rollout_reward_func/mean": 0.5792493224143982,
"rewards/rollout_reward_func/std": 1.3439542055130005,
"sampling/importance_sampling_ratio/max": 2.4977619647979736,
"sampling/importance_sampling_ratio/mean": 0.8369683623313904,
"sampling/importance_sampling_ratio/min": 0.16656683385372162,
"sampling/sampling_logp_difference/max": 1.4865641593933105,
"sampling/sampling_logp_difference/mean": 0.08205842226743698,
"step": 13,
"step_time": 12.781198183001834
},
{
"clip_ratio/high_max": 0.01726190559566021,
"clip_ratio/high_mean": 0.008630952797830105,
"clip_ratio/low_mean": 0.02495265193283558,
"clip_ratio/low_min": 0.010416666977107525,
"clip_ratio/region_mean": 0.033583604730665684,
"entropy": 0.6469080410897732,
"epoch": 0.00014,
"grad_norm": 1.151137351989746,
"kl": 0.02960980085481424,
"learning_rate": 6.933333333333334e-06,
"loss": -0.0327,
"step": 14,
"step_time": 6.906088376998014
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1202.0,
"completions/max_terminated_length": 1202.0,
"completions/mean_length": 325.0625,
"completions/mean_terminated_length": 325.0625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.6098557002842426,
"epoch": 0.00015,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.4717327952384949,
"kl": 0.024435755418380722,
"learning_rate": 7.466666666666667e-06,
"loss": -0.0108,
"num_tokens": 374591.0,
"reward": 0.7930901050567627,
"reward_std": 0.8609839081764221,
"rewards/rollout_reward_func/mean": 0.7930901050567627,
"rewards/rollout_reward_func/std": 1.4151973724365234,
"sampling/importance_sampling_ratio/max": 1.0506209135055542,
"sampling/importance_sampling_ratio/mean": 0.8180942535400391,
"sampling/importance_sampling_ratio/min": 0.21363259851932526,
"sampling/sampling_logp_difference/max": 0.7379248142242432,
"sampling/sampling_logp_difference/mean": 0.058229509741067886,
"step": 15,
"step_time": 12.683557893002217
},
{
"clip_ratio/high_max": 0.009615384973585606,
"clip_ratio/high_mean": 0.004807692486792803,
"clip_ratio/low_mean": 0.022727273404598236,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02753496589139104,
"entropy": 0.5996548756957054,
"epoch": 0.00016,
"grad_norm": 0.4265834391117096,
"kl": 0.04038397324620746,
"learning_rate": 8e-06,
"loss": -0.0112,
"step": 16,
"step_time": 6.956079359999421
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 544.0,
"completions/max_terminated_length": 544.0,
"completions/mean_length": 171.625,
"completions/mean_terminated_length": 171.625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.644575547426939,
"epoch": 0.00017,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.8146066665649414,
"kl": 0.046186436113202944,
"learning_rate": 7.999999999907465e-06,
"loss": -0.007,
"num_tokens": 414601.0,
"reward": 1.4229528903961182,
"reward_std": 0.9664409160614014,
"rewards/rollout_reward_func/mean": 1.4229528903961182,
"rewards/rollout_reward_func/std": 1.3361284732818604,
"sampling/importance_sampling_ratio/max": 1.2239395380020142,
"sampling/importance_sampling_ratio/mean": 0.8816792964935303,
"sampling/importance_sampling_ratio/min": 0.2790951430797577,
"sampling/sampling_logp_difference/max": 0.7080600261688232,
"sampling/sampling_logp_difference/mean": 0.05067237466573715,
"step": 17,
"step_time": 10.178569630998027
},
{
"clip_ratio/high_max": 0.029513888992369175,
"clip_ratio/high_mean": 0.014756944496184587,
"clip_ratio/low_mean": 0.041666666977107525,
"clip_ratio/low_min": 0.02083333395421505,
"clip_ratio/region_mean": 0.05642361147329211,
"entropy": 0.6145340763032436,
"epoch": 0.00018,
"grad_norm": 0.23155571520328522,
"kl": 0.23800076835323125,
"learning_rate": 7.999999999629861e-06,
"loss": -0.0083,
"step": 18,
"step_time": 5.286352907998662
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1043.0,
"completions/max_terminated_length": 1043.0,
"completions/mean_length": 226.78125,
"completions/mean_terminated_length": 226.78125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.475945807993412,
"epoch": 0.00019,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.6718065142631531,
"kl": 0.1313640770076745,
"learning_rate": 7.99999999916719e-06,
"loss": -0.0172,
"num_tokens": 456186.0,
"reward": 0.717463493347168,
"reward_std": 1.0493590831756592,
"rewards/rollout_reward_func/mean": 0.717463493347168,
"rewards/rollout_reward_func/std": 1.385160207748413,
"sampling/importance_sampling_ratio/max": 1.1975980997085571,
"sampling/importance_sampling_ratio/mean": 0.9053879380226135,
"sampling/importance_sampling_ratio/min": 0.5968481302261353,
"sampling/sampling_logp_difference/max": 0.49695074558258057,
"sampling/sampling_logp_difference/mean": 0.04921717196702957,
"step": 19,
"step_time": 11.41534333400341
},
{
"clip_ratio/high_max": 0.034722222946584225,
"clip_ratio/high_mean": 0.017361111473292112,
"clip_ratio/low_mean": 0.027777778450399637,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04513888992369175,
"entropy": 0.45112011581659317,
"epoch": 0.0002,
"grad_norm": 0.26766785979270935,
"kl": 0.20291895651462255,
"learning_rate": 7.999999998519449e-06,
"loss": -0.0184,
"step": 20,
"step_time": 6.5778307339987805
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1094.0,
"completions/max_terminated_length": 1094.0,
"completions/mean_length": 417.625,
"completions/mean_terminated_length": 417.625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.750504732131958,
"epoch": 0.00021,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.421618103981018,
"kl": 0.1448975705425255,
"learning_rate": 7.999999997686637e-06,
"loss": -0.053,
"num_tokens": 508448.0,
"reward": 0.7684807181358337,
"reward_std": 1.7323917150497437,
"rewards/rollout_reward_func/mean": 0.7684807181358337,
"rewards/rollout_reward_func/std": 1.6796200275421143,
"sampling/importance_sampling_ratio/max": 1.4029730558395386,
"sampling/importance_sampling_ratio/mean": 0.7653356194496155,
"sampling/importance_sampling_ratio/min": 0.23433545231819153,
"sampling/sampling_logp_difference/max": 0.8456215858459473,
"sampling/sampling_logp_difference/mean": 0.08127377182245255,
"step": 21,
"step_time": 12.92976628100223
},
{
"clip_ratio/high_max": 0.012500000186264515,
"clip_ratio/high_mean": 0.0062500000931322575,
"clip_ratio/low_mean": 0.057641143910586834,
"clip_ratio/low_min": 0.025252525694668293,
"clip_ratio/region_mean": 0.06389114400371909,
"entropy": 0.7457476258277893,
"epoch": 0.00022,
"grad_norm": 2.050600528717041,
"kl": 1.61455141013721,
"learning_rate": 7.999999996668758e-06,
"loss": -0.0522,
"step": 22,
"step_time": 6.766588177999438
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1030.0,
"completions/max_terminated_length": 1030.0,
"completions/mean_length": 290.15625,
"completions/mean_terminated_length": 290.15625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.4916882663965225,
"epoch": 0.00023,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.3506258726119995,
"kl": 1.069765329360962,
"learning_rate": 7.99999999546581e-06,
"loss": -0.0083,
"num_tokens": 555104.0,
"reward": 1.4062399864196777,
"reward_std": 1.1519603729248047,
"rewards/rollout_reward_func/mean": 1.4062399864196777,
"rewards/rollout_reward_func/std": 1.410425066947937,
"sampling/importance_sampling_ratio/max": 1.3898682594299316,
"sampling/importance_sampling_ratio/mean": 0.8012120723724365,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.8556509017944336,
"sampling/sampling_logp_difference/mean": 0.09328283369541168,
"step": 23,
"step_time": 11.852552100002868
}
],
"logging_steps": 1.0,
"max_steps": 400000,
"num_input_tokens_seen": 555104,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}