Utah-multienv-GRxLP-fix / trainer_state.json
Gege24's picture
Upload task output 1
2194bfe verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.00112,
"eval_steps": 500,
"global_step": 56,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 262.0,
"completions/max_terminated_length": 262.0,
"completions/mean_length": 125.03125,
"completions/mean_terminated_length": 125.03125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.45527722872793674,
"epoch": 2e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.013841008767485619,
"kl": 0.0,
"learning_rate": 0.0,
"loss": -0.0012,
"num_tokens": 51075.0,
"reward": 6.777899265289307,
"reward_std": 5.124541759490967,
"rewards/_dispatch_reward/mean": 6.777899265289307,
"rewards/_dispatch_reward/std": 5.124542236328125,
"sampling/importance_sampling_ratio/max": 1.0203018188476562,
"sampling/importance_sampling_ratio/mean": 0.5446840524673462,
"sampling/importance_sampling_ratio/min": 0.004089090973138809,
"sampling/sampling_logp_difference/max": 0.5533419251441956,
"sampling/sampling_logp_difference/mean": 0.023174326866865158,
"step": 1,
"step_time": 56.13741443500112
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 841.0,
"completions/max_terminated_length": 841.0,
"completions/mean_length": 245.53125,
"completions/mean_terminated_length": 245.53125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.6149225234985352,
"epoch": 4e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.009881995618343353,
"kl": 0.0,
"learning_rate": 2.8571428571428575e-07,
"loss": -0.0059,
"num_tokens": 110688.0,
"reward": 4.6487836837768555,
"reward_std": 3.784931182861328,
"rewards/_dispatch_reward/mean": 4.6487836837768555,
"rewards/_dispatch_reward/std": 3.784930944442749,
"sampling/importance_sampling_ratio/max": 1.3708744049072266,
"sampling/importance_sampling_ratio/mean": 0.3464105427265167,
"sampling/importance_sampling_ratio/min": 8.993503541887549e-08,
"sampling/sampling_logp_difference/max": 0.6291780471801758,
"sampling/sampling_logp_difference/mean": 0.027994709089398384,
"step": 2,
"step_time": 87.9052765430024
},
{
"clip_ratio/high_max": 0.004685592517489567,
"clip_ratio/high_mean": 0.0026511843752814457,
"clip_ratio/low_mean": 0.0012970532843610272,
"clip_ratio/low_min": 0.0004810317768715322,
"clip_ratio/region_mean": 0.003948237717850134,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1309.0,
"completions/max_terminated_length": 1309.0,
"completions/mean_length": 262.6875,
"completions/mean_terminated_length": 262.6875,
"completions/min_length": 87.0,
"completions/min_terminated_length": 87.0,
"entropy": 0.6115306541323662,
"epoch": 6e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01363975927233696,
"kl": 0.0011097499082097784,
"learning_rate": 5.714285714285715e-07,
"loss": 0.0041,
"num_tokens": 149708.0,
"reward": 5.756682395935059,
"reward_std": 2.871458053588867,
"rewards/_dispatch_reward/mean": 5.756682395935059,
"rewards/_dispatch_reward/std": 2.871457815170288,
"sampling/importance_sampling_ratio/max": 1.0003302097320557,
"sampling/importance_sampling_ratio/mean": 0.24685890972614288,
"sampling/importance_sampling_ratio/min": 1.2128718474002653e-08,
"sampling/sampling_logp_difference/max": 1.1710762977600098,
"sampling/sampling_logp_difference/mean": 0.024929963052272797,
"step": 3,
"step_time": 76.56850643599682
},
{
"clip_ratio/high_max": 0.005967892473563552,
"clip_ratio/high_mean": 0.0036123984755249694,
"clip_ratio/low_mean": 0.0005970557394903153,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0042094542295672,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1934.0,
"completions/mean_length": 384.65625,
"completions/mean_terminated_length": 331.0,
"completions/min_length": 100.0,
"completions/min_terminated_length": 100.0,
"entropy": 0.6012951210141182,
"epoch": 8e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.010854247957468033,
"kl": 0.001065619035216514,
"learning_rate": 8.571428571428572e-07,
"loss": -0.0005,
"num_tokens": 196257.0,
"reward": 7.588564872741699,
"reward_std": 1.0828258991241455,
"rewards/_dispatch_reward/mean": 7.588564872741699,
"rewards/_dispatch_reward/std": 1.082825779914856,
"sampling/importance_sampling_ratio/max": 0.6254830956459045,
"sampling/importance_sampling_ratio/mean": 0.21221274137496948,
"sampling/importance_sampling_ratio/min": 3.4579251862885876e-16,
"sampling/sampling_logp_difference/max": 0.9008774161338806,
"sampling/sampling_logp_difference/mean": 0.025733064860105515,
"step": 4,
"step_time": 105.42133834299966
},
{
"clip_ratio/high_max": 0.023003800350124948,
"clip_ratio/high_mean": 0.01184065386041766,
"clip_ratio/low_mean": 0.012614889892574865,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.024455543752992526,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1074.0,
"completions/max_terminated_length": 1074.0,
"completions/mean_length": 90.90625,
"completions/mean_terminated_length": 90.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.298396717524156,
"epoch": 0.0001,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.00900355540215969,
"kl": 0.14300901055321447,
"learning_rate": 1.142857142857143e-06,
"loss": -0.0006,
"num_tokens": 289297.0,
"reward": 2.042935848236084,
"reward_std": 6.4184794425964355,
"rewards/_dispatch_reward/mean": 2.042935848236084,
"rewards/_dispatch_reward/std": 6.418478965759277,
"sampling/importance_sampling_ratio/max": 1.389119029045105,
"sampling/importance_sampling_ratio/mean": 0.7724133729934692,
"sampling/importance_sampling_ratio/min": 1.1340963510519941e-06,
"sampling/sampling_logp_difference/max": 2.341449022293091,
"sampling/sampling_logp_difference/mean": 0.023405466228723526,
"step": 5,
"step_time": 76.50286699199569
},
{
"clip_ratio/high_max": 0.0006235100008780137,
"clip_ratio/high_mean": 0.00031175500043900684,
"clip_ratio/low_mean": 0.012735862444969825,
"clip_ratio/low_min": 9.578544268151745e-05,
"clip_ratio/region_mean": 0.013047617438132875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1302.0,
"completions/max_terminated_length": 1302.0,
"completions/mean_length": 98.15625,
"completions/mean_terminated_length": 98.15625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.3092191433534026,
"epoch": 0.00012,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0037304293364286423,
"kl": 0.009563418556354009,
"learning_rate": 1.4285714285714286e-06,
"loss": -0.0002,
"num_tokens": 382323.0,
"reward": 1.780243992805481,
"reward_std": 5.718746185302734,
"rewards/_dispatch_reward/mean": 1.780243992805481,
"rewards/_dispatch_reward/std": 5.718745708465576,
"sampling/importance_sampling_ratio/max": 1.6135059595108032,
"sampling/importance_sampling_ratio/mean": 0.7582643032073975,
"sampling/importance_sampling_ratio/min": 7.616848124447628e-12,
"sampling/sampling_logp_difference/max": 0.5308667421340942,
"sampling/sampling_logp_difference/mean": 0.023645631968975067,
"step": 6,
"step_time": 66.5997183530053
},
{
"clip_ratio/high_max": 0.005657617410179228,
"clip_ratio/high_mean": 0.0030029033950995654,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0030029033950995654,
"completions/clipped_ratio": 0.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 729.0,
"completions/mean_length": 197.40625,
"completions/mean_terminated_length": 197.40625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.5808572247624397,
"epoch": 0.00014,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.009763856418430805,
"kl": 0.0010594904888421297,
"learning_rate": 1.7142857142857145e-06,
"loss": -0.0009,
"num_tokens": 438093.0,
"reward": 5.920393466949463,
"reward_std": 5.249653339385986,
"rewards/_dispatch_reward/mean": 5.920393466949463,
"rewards/_dispatch_reward/std": 5.249653339385986,
"sampling/importance_sampling_ratio/max": 1.0098344087600708,
"sampling/importance_sampling_ratio/mean": 0.3949565887451172,
"sampling/importance_sampling_ratio/min": 1.0556744811651697e-08,
"sampling/sampling_logp_difference/max": 0.6985321044921875,
"sampling/sampling_logp_difference/mean": 0.027950983494520187,
"step": 7,
"step_time": 69.09156412899756
},
{
"clip_ratio/high_max": 0.008222879550885409,
"clip_ratio/high_mean": 0.0050142133550252765,
"clip_ratio/low_mean": 0.012798447132809088,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.017812660371419042,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1242.0,
"completions/max_terminated_length": 1242.0,
"completions/mean_length": 259.34375,
"completions/mean_terminated_length": 259.34375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.5893099643290043,
"epoch": 0.00016,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.02151084505021572,
"kl": 0.0027830141116282903,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.0011,
"num_tokens": 496400.0,
"reward": 5.134228229522705,
"reward_std": 4.130280017852783,
"rewards/_dispatch_reward/mean": 5.134228229522705,
"rewards/_dispatch_reward/std": 4.130280017852783,
"sampling/importance_sampling_ratio/max": 1.2578178644180298,
"sampling/importance_sampling_ratio/mean": 0.38606229424476624,
"sampling/importance_sampling_ratio/min": 1.0430401681249535e-10,
"sampling/sampling_logp_difference/max": 0.6706124544143677,
"sampling/sampling_logp_difference/mean": 0.027107033878564835,
"step": 8,
"step_time": 76.88476859900038
},
{
"clip_ratio/high_max": 0.0024189914984162897,
"clip_ratio/high_mean": 0.0012094957492081448,
"clip_ratio/low_mean": 0.0003834999370155856,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0015929956862237304,
"completions/clipped_ratio": 0.0,
"completions/max_length": 348.0,
"completions/max_terminated_length": 348.0,
"completions/mean_length": 123.34375,
"completions/mean_terminated_length": 123.34375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.34363649412989616,
"epoch": 0.00018,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.009796424768865108,
"kl": 0.0008698884230398107,
"learning_rate": 2.285714285714286e-06,
"loss": -0.0034,
"num_tokens": 570174.0,
"reward": 2.8205642700195312,
"reward_std": 4.669835090637207,
"rewards/_dispatch_reward/mean": 2.8205642700195312,
"rewards/_dispatch_reward/std": 4.669835090637207,
"sampling/importance_sampling_ratio/max": 1.1795650720596313,
"sampling/importance_sampling_ratio/mean": 0.605711042881012,
"sampling/importance_sampling_ratio/min": 6.102543557062745e-05,
"sampling/sampling_logp_difference/max": 0.43030834197998047,
"sampling/sampling_logp_difference/mean": 0.016010664403438568,
"step": 9,
"step_time": 68.84652510398882
},
{
"clip_ratio/high_max": 0.0027637388557195663,
"clip_ratio/high_mean": 0.0013818694278597832,
"clip_ratio/low_mean": 0.0009210727730533108,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002302942215465009,
"completions/clipped_ratio": 0.0,
"completions/max_length": 322.0,
"completions/max_terminated_length": 322.0,
"completions/mean_length": 100.75,
"completions/mean_terminated_length": 100.75,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.3151314351707697,
"epoch": 0.0002,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.009586871601641178,
"kl": 0.0016707113827578723,
"learning_rate": 2.571428571428571e-06,
"loss": -0.0005,
"num_tokens": 645301.0,
"reward": 3.475369930267334,
"reward_std": 4.783686637878418,
"rewards/_dispatch_reward/mean": 3.475369930267334,
"rewards/_dispatch_reward/std": 4.783686637878418,
"sampling/importance_sampling_ratio/max": 1.1692392826080322,
"sampling/importance_sampling_ratio/mean": 0.6406182050704956,
"sampling/importance_sampling_ratio/min": 0.02848934382200241,
"sampling/sampling_logp_difference/max": 0.508894681930542,
"sampling/sampling_logp_difference/mean": 0.018476907163858414,
"step": 10,
"step_time": 70.24923908900018
},
{
"clip_ratio/high_max": 0.0052745313150808215,
"clip_ratio/high_mean": 0.0026372656575404108,
"clip_ratio/low_mean": 0.001209677429869771,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0038469430874101818,
"completions/clipped_ratio": 0.0,
"completions/max_length": 446.0,
"completions/max_terminated_length": 446.0,
"completions/mean_length": 106.84375,
"completions/mean_terminated_length": 106.84375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.3057017717510462,
"epoch": 0.00022,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.017719965428113937,
"kl": 0.0021114282753842417,
"learning_rate": 2.8571428571428573e-06,
"loss": -0.0001,
"num_tokens": 716206.0,
"reward": 2.0035197734832764,
"reward_std": 4.043484687805176,
"rewards/_dispatch_reward/mean": 2.0035197734832764,
"rewards/_dispatch_reward/std": 4.043484210968018,
"sampling/importance_sampling_ratio/max": 1.0895683765411377,
"sampling/importance_sampling_ratio/mean": 0.6331280469894409,
"sampling/importance_sampling_ratio/min": 0.0012601253110915422,
"sampling/sampling_logp_difference/max": 0.6603082418441772,
"sampling/sampling_logp_difference/mean": 0.01621483638882637,
"step": 11,
"step_time": 59.28811028199925
},
{
"clip_ratio/high_max": 0.0028494500438682735,
"clip_ratio/high_mean": 0.0014247250219341367,
"clip_ratio/low_mean": 0.0008933765202527866,
"clip_ratio/low_min": 0.00016066837997641414,
"clip_ratio/region_mean": 0.0023181015421869233,
"completions/clipped_ratio": 0.0,
"completions/max_length": 628.0,
"completions/max_terminated_length": 628.0,
"completions/mean_length": 205.625,
"completions/mean_terminated_length": 205.625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.6472973525524139,
"epoch": 0.00024,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.008480154909193516,
"kl": 0.0010127535788342357,
"learning_rate": 3.142857142857143e-06,
"loss": -0.0009,
"num_tokens": 776411.0,
"reward": 4.977667808532715,
"reward_std": 4.607911109924316,
"rewards/_dispatch_reward/mean": 4.977667808532715,
"rewards/_dispatch_reward/std": 4.607911109924316,
"sampling/importance_sampling_ratio/max": 1.0007600784301758,
"sampling/importance_sampling_ratio/mean": 0.34274715185165405,
"sampling/importance_sampling_ratio/min": 2.605641702402295e-09,
"sampling/sampling_logp_difference/max": 0.562023401260376,
"sampling/sampling_logp_difference/mean": 0.02857815846800804,
"step": 12,
"step_time": 78.00761527100258
},
{
"clip_ratio/high_max": 0.004665051761548966,
"clip_ratio/high_mean": 0.002332525880774483,
"clip_ratio/low_mean": 0.01103231159504503,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013364837534027174,
"completions/clipped_ratio": 0.0,
"completions/max_length": 876.0,
"completions/max_terminated_length": 876.0,
"completions/mean_length": 121.75,
"completions/mean_terminated_length": 121.75,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.42481718584895134,
"epoch": 0.00026,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.008503826335072517,
"kl": 0.006206674090208253,
"learning_rate": 3.428571428571429e-06,
"loss": -0.0005,
"num_tokens": 848645.0,
"reward": 3.7999541759490967,
"reward_std": 5.260345935821533,
"rewards/_dispatch_reward/mean": 3.7999541759490967,
"rewards/_dispatch_reward/std": 5.260345458984375,
"sampling/importance_sampling_ratio/max": 1.3213924169540405,
"sampling/importance_sampling_ratio/mean": 0.7206687331199646,
"sampling/importance_sampling_ratio/min": 1.3760912906946032e-06,
"sampling/sampling_logp_difference/max": 0.5942957401275635,
"sampling/sampling_logp_difference/mean": 0.02436770685017109,
"step": 13,
"step_time": 65.29833886800043
},
{
"clip_ratio/high_max": 0.02439830679213628,
"clip_ratio/high_mean": 0.01219915339606814,
"clip_ratio/low_mean": 0.01281407053465955,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.025013223712448962,
"completions/clipped_ratio": 0.0,
"completions/max_length": 299.0,
"completions/max_terminated_length": 299.0,
"completions/mean_length": 102.25,
"completions/mean_terminated_length": 102.25,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.3535791225731373,
"epoch": 0.00028,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.007466800510883331,
"kl": 0.0010526334699534345,
"learning_rate": 3.7142857142857146e-06,
"loss": -0.003,
"num_tokens": 922414.0,
"reward": 3.58528208732605,
"reward_std": 5.264614582061768,
"rewards/_dispatch_reward/mean": 3.58528208732605,
"rewards/_dispatch_reward/std": 5.264614582061768,
"sampling/importance_sampling_ratio/max": 1.6783454418182373,
"sampling/importance_sampling_ratio/mean": 0.6591842770576477,
"sampling/importance_sampling_ratio/min": 0.015803860500454903,
"sampling/sampling_logp_difference/max": 0.6768133044242859,
"sampling/sampling_logp_difference/mean": 0.027988821268081665,
"step": 14,
"step_time": 76.04676585100606
},
{
"clip_ratio/high_max": 0.0046294865605887026,
"clip_ratio/high_mean": 0.0031611660815542564,
"clip_ratio/low_mean": 0.0009954585111699998,
"clip_ratio/low_min": 0.000405844155466184,
"clip_ratio/region_mean": 0.004156624592724256,
"completions/clipped_ratio": 0.0,
"completions/max_length": 564.0,
"completions/max_terminated_length": 564.0,
"completions/mean_length": 184.3125,
"completions/mean_terminated_length": 184.3125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.5234995484352112,
"epoch": 0.0003,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.017105605453252792,
"kl": 0.001143711997428909,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0024,
"num_tokens": 976803.0,
"reward": 4.643593788146973,
"reward_std": 3.711594820022583,
"rewards/_dispatch_reward/mean": 4.643593788146973,
"rewards/_dispatch_reward/std": 3.711594581604004,
"sampling/importance_sampling_ratio/max": 1.0112754106521606,
"sampling/importance_sampling_ratio/mean": 0.43820053339004517,
"sampling/importance_sampling_ratio/min": 0.0061927661299705505,
"sampling/sampling_logp_difference/max": 1.0779176950454712,
"sampling/sampling_logp_difference/mean": 0.023352596908807755,
"step": 15,
"step_time": 69.32589143500081
},
{
"clip_ratio/high_max": 0.0053526594856521115,
"clip_ratio/high_mean": 0.0026763297428260557,
"clip_ratio/low_mean": 0.001263205318537075,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0039395351050188765,
"completions/clipped_ratio": 0.0,
"completions/max_length": 361.0,
"completions/max_terminated_length": 361.0,
"completions/mean_length": 137.4375,
"completions/mean_terminated_length": 137.4375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.4956584945321083,
"epoch": 0.00032,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.010012968443334103,
"kl": 0.0016766081389505416,
"learning_rate": 4.2857142857142855e-06,
"loss": 0.0017,
"num_tokens": 1027131.0,
"reward": 6.254507064819336,
"reward_std": 4.845858097076416,
"rewards/_dispatch_reward/mean": 6.254507064819336,
"rewards/_dispatch_reward/std": 4.845858097076416,
"sampling/importance_sampling_ratio/max": 1.2621318101882935,
"sampling/importance_sampling_ratio/mean": 0.4852866530418396,
"sampling/importance_sampling_ratio/min": 0.0020480025559663773,
"sampling/sampling_logp_difference/max": 0.6868143081665039,
"sampling/sampling_logp_difference/mean": 0.024414777755737305,
"step": 16,
"step_time": 56.2167018509972
},
{
"clip_ratio/high_max": 0.004321954504121095,
"clip_ratio/high_mean": 0.0021609772520605475,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0021609772520605475,
"completions/clipped_ratio": 0.0,
"completions/max_length": 652.0,
"completions/max_terminated_length": 652.0,
"completions/mean_length": 64.6875,
"completions/mean_terminated_length": 64.6875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.2991622658446431,
"epoch": 0.00034,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.02793041430413723,
"kl": 0.003004775617228006,
"learning_rate": 4.571428571428572e-06,
"loss": -0.0019,
"num_tokens": 1116281.0,
"reward": 1.158271074295044,
"reward_std": 3.9837613105773926,
"rewards/_dispatch_reward/mean": 1.158271074295044,
"rewards/_dispatch_reward/std": 3.9837613105773926,
"sampling/importance_sampling_ratio/max": 1.7287894487380981,
"sampling/importance_sampling_ratio/mean": 0.8597855567932129,
"sampling/importance_sampling_ratio/min": 0.00014148977061267942,
"sampling/sampling_logp_difference/max": 0.5800625085830688,
"sampling/sampling_logp_difference/mean": 0.031770870089530945,
"step": 17,
"step_time": 45.771620833998895
},
{
"clip_ratio/high_max": 0.0039806214335840195,
"clip_ratio/high_mean": 0.0019903107167920098,
"clip_ratio/low_mean": 0.013759620025666663,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015749930713354843,
"completions/clipped_ratio": 0.0,
"completions/max_length": 973.0,
"completions/max_terminated_length": 973.0,
"completions/mean_length": 182.59375,
"completions/mean_terminated_length": 182.59375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.5259533487260342,
"epoch": 0.00036,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.008414385840296745,
"kl": 0.006829695048509166,
"learning_rate": 4.857142857142858e-06,
"loss": -0.0015,
"num_tokens": 1171273.0,
"reward": 5.743894100189209,
"reward_std": 4.690041542053223,
"rewards/_dispatch_reward/mean": 5.743894100189209,
"rewards/_dispatch_reward/std": 4.690041542053223,
"sampling/importance_sampling_ratio/max": 1.9703744649887085,
"sampling/importance_sampling_ratio/mean": 0.46085572242736816,
"sampling/importance_sampling_ratio/min": 1.293967510918037e-08,
"sampling/sampling_logp_difference/max": 0.6781671047210693,
"sampling/sampling_logp_difference/mean": 0.024906471371650696,
"step": 18,
"step_time": 61.14139282900214
},
{
"clip_ratio/high_max": 0.0030382057011593133,
"clip_ratio/high_mean": 0.00191759922017809,
"clip_ratio/low_mean": 0.0008816451372695155,
"clip_ratio/low_min": 0.0002367424312978983,
"clip_ratio/region_mean": 0.002799244350171648,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1660.0,
"completions/max_terminated_length": 1660.0,
"completions/mean_length": 186.5,
"completions/mean_terminated_length": 186.5,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.602867329493165,
"epoch": 0.00038,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004653717391192913,
"kl": 0.0013299644269864075,
"learning_rate": 5.142857142857142e-06,
"loss": 0.0004,
"num_tokens": 1252413.0,
"reward": 2.5815396308898926,
"reward_std": 4.349632263183594,
"rewards/_dispatch_reward/mean": 2.5815396308898926,
"rewards/_dispatch_reward/std": 4.349632740020752,
"sampling/importance_sampling_ratio/max": 1.1002501249313354,
"sampling/importance_sampling_ratio/mean": 0.555901050567627,
"sampling/importance_sampling_ratio/min": 1.2933999649078487e-08,
"sampling/sampling_logp_difference/max": 0.869408369064331,
"sampling/sampling_logp_difference/mean": 0.02660653553903103,
"step": 19,
"step_time": 78.29009881900129
},
{
"clip_ratio/high_max": 0.0009124087519012392,
"clip_ratio/high_mean": 0.0004562043759506196,
"clip_ratio/low_mean": 0.0009879888530122116,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0014441932289628312,
"completions/clipped_ratio": 0.0,
"completions/max_length": 368.0,
"completions/max_terminated_length": 368.0,
"completions/mean_length": 73.71875,
"completions/mean_terminated_length": 73.71875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.33102226129267365,
"epoch": 0.0004,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.029436543583869934,
"kl": 0.0005448539421308851,
"learning_rate": 5.428571428571429e-06,
"loss": -0.009,
"num_tokens": 1351826.0,
"reward": 0.3658851385116577,
"reward_std": 2.878241539001465,
"rewards/_dispatch_reward/mean": 0.3658851385116577,
"rewards/_dispatch_reward/std": 2.878241539001465,
"sampling/importance_sampling_ratio/max": 1.82212495803833,
"sampling/importance_sampling_ratio/mean": 0.813861072063446,
"sampling/importance_sampling_ratio/min": 6.510071398224682e-05,
"sampling/sampling_logp_difference/max": 0.550661563873291,
"sampling/sampling_logp_difference/mean": 0.015052185393869877,
"step": 20,
"step_time": 56.76346912401641
},
{
"clip_ratio/high_max": 0.005187851551454514,
"clip_ratio/high_mean": 0.002695058472454548,
"clip_ratio/low_mean": 0.00019778481510002166,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0028928432147949934,
"completions/clipped_ratio": 0.0,
"completions/max_length": 497.0,
"completions/max_terminated_length": 497.0,
"completions/mean_length": 108.03125,
"completions/mean_terminated_length": 108.03125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.37717200443148613,
"epoch": 0.00042,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.009999923408031464,
"kl": 0.000724673747754423,
"learning_rate": 5.7142857142857145e-06,
"loss": -0.0028,
"num_tokens": 1436552.0,
"reward": 2.89748477935791,
"reward_std": 4.333184719085693,
"rewards/_dispatch_reward/mean": 2.89748477935791,
"rewards/_dispatch_reward/std": 4.333184719085693,
"sampling/importance_sampling_ratio/max": 1.1284852027893066,
"sampling/importance_sampling_ratio/mean": 0.6845414638519287,
"sampling/importance_sampling_ratio/min": 0.00010813102562678978,
"sampling/sampling_logp_difference/max": 0.4671330451965332,
"sampling/sampling_logp_difference/mean": 0.023374546319246292,
"step": 21,
"step_time": 54.53796494498965
},
{
"clip_ratio/high_max": 0.07083333469927311,
"clip_ratio/high_mean": 0.035416667349636555,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04583333432674408,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.75,
"completions/mean_terminated_length": 2.75,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.07059835561085492,
"epoch": 0.00044,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.002399139106273651,
"kl": 0.0038920448471913005,
"learning_rate": 6e-06,
"loss": 0.0,
"num_tokens": 1575800.0,
"reward": -1.2599544525146484,
"reward_std": 0.19062143564224243,
"rewards/_dispatch_reward/mean": -1.2599544525146484,
"rewards/_dispatch_reward/std": 0.19062143564224243,
"sampling/importance_sampling_ratio/max": 1.1356183290481567,
"sampling/importance_sampling_ratio/mean": 0.9610987901687622,
"sampling/importance_sampling_ratio/min": 0.41064921021461487,
"sampling/sampling_logp_difference/max": 0.6516157388687134,
"sampling/sampling_logp_difference/mean": 0.024187467992305756,
"step": 22,
"step_time": 46.77752613399207
},
{
"clip_ratio/high_max": 0.005154712242074311,
"clip_ratio/high_mean": 0.0035003781085833907,
"clip_ratio/low_mean": 0.0005952381179668009,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004095616226550192,
"completions/clipped_ratio": 0.0,
"completions/max_length": 351.0,
"completions/max_terminated_length": 351.0,
"completions/mean_length": 105.1875,
"completions/mean_terminated_length": 105.1875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.410700709791854,
"epoch": 0.00046,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.014338094741106033,
"kl": 0.0008683733431098517,
"learning_rate": 6.285714285714286e-06,
"loss": -0.0041,
"num_tokens": 1671271.0,
"reward": 3.2939820289611816,
"reward_std": 4.406397342681885,
"rewards/_dispatch_reward/mean": 3.2939820289611816,
"rewards/_dispatch_reward/std": 4.406397342681885,
"sampling/importance_sampling_ratio/max": 1.676399827003479,
"sampling/importance_sampling_ratio/mean": 0.6958481073379517,
"sampling/importance_sampling_ratio/min": 0.01673131436109543,
"sampling/sampling_logp_difference/max": 0.9975869655609131,
"sampling/sampling_logp_difference/mean": 0.022911131381988525,
"step": 23,
"step_time": 63.07633655099198
},
{
"clip_ratio/high_max": 0.005110417871037498,
"clip_ratio/high_mean": 0.002555208935518749,
"clip_ratio/low_mean": 0.0005854800692759454,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0031406890047946945,
"completions/clipped_ratio": 0.0,
"completions/max_length": 367.0,
"completions/max_terminated_length": 367.0,
"completions/mean_length": 92.78125,
"completions/mean_terminated_length": 92.78125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.28824638947844505,
"epoch": 0.00048,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.013156220316886902,
"kl": 0.0007490101343137212,
"learning_rate": 6.571428571428572e-06,
"loss": -0.0012,
"num_tokens": 1770738.0,
"reward": 2.9093494415283203,
"reward_std": 4.354193210601807,
"rewards/_dispatch_reward/mean": 2.9093494415283203,
"rewards/_dispatch_reward/std": 4.354193210601807,
"sampling/importance_sampling_ratio/max": 1.6265262365341187,
"sampling/importance_sampling_ratio/mean": 0.6805465221405029,
"sampling/importance_sampling_ratio/min": 0.0026759125757962465,
"sampling/sampling_logp_difference/max": 0.4296213388442993,
"sampling/sampling_logp_difference/mean": 0.012809459120035172,
"step": 24,
"step_time": 67.15237878700282
},
{
"clip_ratio/high_max": 0.007100892311427742,
"clip_ratio/high_mean": 0.004247377044521272,
"clip_ratio/low_mean": 0.0008122937870211899,
"clip_ratio/low_min": 0.0005247487570159137,
"clip_ratio/region_mean": 0.005059670831542462,
"completions/clipped_ratio": 0.0,
"completions/max_length": 480.0,
"completions/max_terminated_length": 480.0,
"completions/mean_length": 178.03125,
"completions/mean_terminated_length": 178.03125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.583958063274622,
"epoch": 0.0005,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.009483549743890762,
"kl": 0.00117611356836278,
"learning_rate": 6.857142857142858e-06,
"loss": -0.0058,
"num_tokens": 1843514.0,
"reward": 5.987891674041748,
"reward_std": 4.206455230712891,
"rewards/_dispatch_reward/mean": 5.987891674041748,
"rewards/_dispatch_reward/std": 4.206455230712891,
"sampling/importance_sampling_ratio/max": 1.2595504522323608,
"sampling/importance_sampling_ratio/mean": 0.4059632420539856,
"sampling/importance_sampling_ratio/min": 1.8272161241839058e-06,
"sampling/sampling_logp_difference/max": 0.9262754917144775,
"sampling/sampling_logp_difference/mean": 0.02728903479874134,
"step": 25,
"step_time": 72.96448353200685
},
{
"clip_ratio/high_max": 0.006182103767059743,
"clip_ratio/high_mean": 0.0037271984620019794,
"clip_ratio/low_mean": 0.0003773117423406802,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004104510211618617,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1298.0,
"completions/max_terminated_length": 1298.0,
"completions/mean_length": 235.71875,
"completions/mean_terminated_length": 235.71875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.5523776076734066,
"epoch": 0.00052,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.007155085913836956,
"kl": 0.0011198167048860341,
"learning_rate": 7.1428571428571436e-06,
"loss": -0.0005,
"num_tokens": 1920482.0,
"reward": 5.081435680389404,
"reward_std": 4.353846073150635,
"rewards/_dispatch_reward/mean": 5.081435680389404,
"rewards/_dispatch_reward/std": 4.353846073150635,
"sampling/importance_sampling_ratio/max": 1.0659159421920776,
"sampling/importance_sampling_ratio/mean": 0.38374269008636475,
"sampling/importance_sampling_ratio/min": 6.281219468162735e-12,
"sampling/sampling_logp_difference/max": 0.8278406858444214,
"sampling/sampling_logp_difference/mean": 0.027728049084544182,
"step": 26,
"step_time": 98.4568334949945
},
{
"clip_ratio/high_max": 0.008924347494030371,
"clip_ratio/high_mean": 0.004462173747015186,
"clip_ratio/low_mean": 0.0015605021035298705,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006022675821441226,
"completions/clipped_ratio": 0.0,
"completions/max_length": 284.0,
"completions/max_terminated_length": 284.0,
"completions/mean_length": 85.5625,
"completions/mean_terminated_length": 85.5625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.33285857360169757,
"epoch": 0.00054,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.010880419053137302,
"kl": 0.0011197539679415058,
"learning_rate": 7.428571428571429e-06,
"loss": 0.0032,
"num_tokens": 2024608.0,
"reward": 3.6858019828796387,
"reward_std": 5.164435386657715,
"rewards/_dispatch_reward/mean": 3.6858019828796387,
"rewards/_dispatch_reward/std": 5.164435386657715,
"sampling/importance_sampling_ratio/max": 1.2996236085891724,
"sampling/importance_sampling_ratio/mean": 0.732099175453186,
"sampling/importance_sampling_ratio/min": 0.03455425426363945,
"sampling/sampling_logp_difference/max": 0.9168803691864014,
"sampling/sampling_logp_difference/mean": 0.024052917957305908,
"step": 27,
"step_time": 58.99345494199952
},
{
"clip_ratio/high_max": 0.0029420487117022276,
"clip_ratio/high_mean": 0.0014710243558511138,
"clip_ratio/low_mean": 0.000155472633196041,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0016264969890471548,
"completions/clipped_ratio": 0.0,
"completions/max_length": 272.0,
"completions/max_terminated_length": 272.0,
"completions/mean_length": 44.65625,
"completions/mean_terminated_length": 44.65625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.17506308463089226,
"epoch": 0.00056,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.007396813947707415,
"kl": 0.00044269679888486735,
"learning_rate": 7.714285714285716e-06,
"loss": 0.0008,
"num_tokens": 2174423.0,
"reward": 1.1599459648132324,
"reward_std": 3.9757213592529297,
"rewards/_dispatch_reward/mean": 1.1599459648132324,
"rewards/_dispatch_reward/std": 3.975721597671509,
"sampling/importance_sampling_ratio/max": 1.213560938835144,
"sampling/importance_sampling_ratio/mean": 0.8505507707595825,
"sampling/importance_sampling_ratio/min": 0.03090948425233364,
"sampling/sampling_logp_difference/max": 0.5677609443664551,
"sampling/sampling_logp_difference/mean": 0.012984106317162514,
"step": 28,
"step_time": 68.44782462599687
},
{
"clip_ratio/high_max": 0.004619319050107151,
"clip_ratio/high_mean": 0.0025808092032093555,
"clip_ratio/low_mean": 0.0016296468093059957,
"clip_ratio/low_min": 0.00032637076219543815,
"clip_ratio/region_mean": 0.004210455983411521,
"completions/clipped_ratio": 0.0,
"completions/max_length": 364.0,
"completions/max_terminated_length": 364.0,
"completions/mean_length": 186.84375,
"completions/mean_terminated_length": 186.84375,
"completions/min_length": 104.0,
"completions/min_terminated_length": 104.0,
"entropy": 0.5688229724764824,
"epoch": 0.00058,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.018515318632125854,
"kl": 0.0013667424791492522,
"learning_rate": 8.000000000000001e-06,
"loss": -0.004,
"num_tokens": 2210913.0,
"reward": 7.494377136230469,
"reward_std": 1.6056934595108032,
"rewards/_dispatch_reward/mean": 7.494377136230469,
"rewards/_dispatch_reward/std": 1.6056934595108032,
"sampling/importance_sampling_ratio/max": 2.078017473220825,
"sampling/importance_sampling_ratio/mean": 0.331025093793869,
"sampling/importance_sampling_ratio/min": 0.004665270447731018,
"sampling/sampling_logp_difference/max": 0.5339968204498291,
"sampling/sampling_logp_difference/mean": 0.023812616243958473,
"step": 29,
"step_time": 64.19203726600244
},
{
"clip_ratio/high_max": 0.026041667093522847,
"clip_ratio/high_mean": 0.013467262091580778,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013467262091580778,
"completions/clipped_ratio": 0.0,
"completions/max_length": 208.0,
"completions/max_terminated_length": 208.0,
"completions/mean_length": 37.59375,
"completions/mean_terminated_length": 37.59375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.183818920282647,
"epoch": 0.0006,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0071673765778541565,
"kl": 0.001944019940860242,
"learning_rate": 8.285714285714287e-06,
"loss": -0.0012,
"num_tokens": 2367899.0,
"reward": 1.659142255783081,
"reward_std": 5.015908718109131,
"rewards/_dispatch_reward/mean": 1.659142255783081,
"rewards/_dispatch_reward/std": 5.015908241271973,
"sampling/importance_sampling_ratio/max": 1.345532774925232,
"sampling/importance_sampling_ratio/mean": 0.8848145604133606,
"sampling/importance_sampling_ratio/min": 0.18236808478832245,
"sampling/sampling_logp_difference/max": 0.40476560592651367,
"sampling/sampling_logp_difference/mean": 0.010478072799742222,
"step": 30,
"step_time": 69.87089692599693
},
{
"clip_ratio/high_max": 0.006121268626884557,
"clip_ratio/high_mean": 0.0030606343134422787,
"clip_ratio/low_mean": 0.0012486299383454025,
"clip_ratio/low_min": 0.0006476683774963021,
"clip_ratio/region_mean": 0.004309264237235766,
"completions/clipped_ratio": 0.0,
"completions/max_length": 613.0,
"completions/max_terminated_length": 613.0,
"completions/mean_length": 164.5625,
"completions/mean_terminated_length": 164.5625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.5194896943867207,
"epoch": 0.00062,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01138492301106453,
"kl": 0.0011847462155856192,
"learning_rate": 8.571428571428571e-06,
"loss": -0.0062,
"num_tokens": 2451108.0,
"reward": 4.4887261390686035,
"reward_std": 3.793579578399658,
"rewards/_dispatch_reward/mean": 4.4887261390686035,
"rewards/_dispatch_reward/std": 3.7935798168182373,
"sampling/importance_sampling_ratio/max": 1.4072153568267822,
"sampling/importance_sampling_ratio/mean": 0.44676199555397034,
"sampling/importance_sampling_ratio/min": 2.846842835424468e-05,
"sampling/sampling_logp_difference/max": 0.8780875205993652,
"sampling/sampling_logp_difference/mean": 0.025232184678316116,
"step": 31,
"step_time": 88.69532321900624
},
{
"clip_ratio/high_max": 0.001911135099362582,
"clip_ratio/high_mean": 0.000955567549681291,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01137223420664668,
"completions/clipped_ratio": 0.0,
"completions/max_length": 385.0,
"completions/max_terminated_length": 385.0,
"completions/mean_length": 59.125,
"completions/mean_terminated_length": 59.125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.24192802239849698,
"epoch": 0.00064,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004926626570522785,
"kl": 0.010344951566121807,
"learning_rate": 8.857142857142858e-06,
"loss": -0.0018,
"num_tokens": 2635061.0,
"reward": 0.8189454078674316,
"reward_std": 3.5781614780426025,
"rewards/_dispatch_reward/mean": 0.8189454078674316,
"rewards/_dispatch_reward/std": 3.5781612396240234,
"sampling/importance_sampling_ratio/max": 1.4798288345336914,
"sampling/importance_sampling_ratio/mean": 0.8191201686859131,
"sampling/importance_sampling_ratio/min": 0.011969967745244503,
"sampling/sampling_logp_difference/max": 0.9833095073699951,
"sampling/sampling_logp_difference/mean": 0.027340415865182877,
"step": 32,
"step_time": 77.33599497902469
},
{
"clip_ratio/high_max": 0.005233386065810919,
"clip_ratio/high_mean": 0.003829600813332945,
"clip_ratio/low_mean": 0.0018207434732175898,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005650344141031383,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1172.0,
"completions/max_terminated_length": 1172.0,
"completions/mean_length": 197.6875,
"completions/mean_terminated_length": 197.6875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.47484813444316387,
"epoch": 0.00066,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.009056804701685905,
"kl": 0.0013335544645087793,
"learning_rate": 9.142857142857144e-06,
"loss": -0.0028,
"num_tokens": 2719758.0,
"reward": 5.25334358215332,
"reward_std": 4.086261749267578,
"rewards/_dispatch_reward/mean": 5.25334358215332,
"rewards/_dispatch_reward/std": 4.086261749267578,
"sampling/importance_sampling_ratio/max": 1.2484526634216309,
"sampling/importance_sampling_ratio/mean": 0.5199525952339172,
"sampling/importance_sampling_ratio/min": 2.673025863153544e-12,
"sampling/sampling_logp_difference/max": 0.5819270610809326,
"sampling/sampling_logp_difference/mean": 0.026656974107027054,
"step": 33,
"step_time": 99.14059063902096
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.0005853353350175894,
"epoch": 0.00068,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.8580616092076525e-05,
"kl": 6.113201492796705e-07,
"learning_rate": 9.42857142857143e-06,
"loss": -0.0,
"num_tokens": 2961120.0,
"reward": -1.053731918334961,
"reward_std": 0.5529842376708984,
"rewards/_dispatch_reward/mean": -1.053731918334961,
"rewards/_dispatch_reward/std": 0.5529841780662537,
"sampling/importance_sampling_ratio/max": 0.999992847442627,
"sampling/importance_sampling_ratio/mean": 0.9999217987060547,
"sampling/importance_sampling_ratio/min": 0.9980286955833435,
"sampling/sampling_logp_difference/max": 0.001961317379027605,
"sampling/sampling_logp_difference/mean": 2.823377690219786e-05,
"step": 34,
"step_time": 82.58755905900762
},
{
"clip_ratio/high_max": 0.008132743969326839,
"clip_ratio/high_mean": 0.004193663233309053,
"clip_ratio/low_mean": 0.0010917649779003114,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005285428196657449,
"completions/clipped_ratio": 0.0,
"completions/max_length": 540.0,
"completions/max_terminated_length": 540.0,
"completions/mean_length": 130.53125,
"completions/mean_terminated_length": 130.53125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.43125689217959007,
"epoch": 0.0007,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.007139499299228191,
"kl": 0.0011299467951175757,
"learning_rate": 9.714285714285715e-06,
"loss": -0.0014,
"num_tokens": 3094548.0,
"reward": 2.2313008308410645,
"reward_std": 3.8178205490112305,
"rewards/_dispatch_reward/mean": 2.2313008308410645,
"rewards/_dispatch_reward/std": 3.8178205490112305,
"sampling/importance_sampling_ratio/max": 1.6531254053115845,
"sampling/importance_sampling_ratio/mean": 0.6061273813247681,
"sampling/importance_sampling_ratio/min": 0.0009113638079725206,
"sampling/sampling_logp_difference/max": 0.5405864715576172,
"sampling/sampling_logp_difference/mean": 0.024779872968792915,
"step": 35,
"step_time": 89.02141472401127
},
{
"clip_ratio/high_max": 0.004321634216466919,
"clip_ratio/high_mean": 0.0026319201424485072,
"clip_ratio/low_mean": 0.000901452629477717,
"clip_ratio/low_min": 0.00013440860493574291,
"clip_ratio/region_mean": 0.003533372830133885,
"completions/clipped_ratio": 0.0,
"completions/max_length": 573.0,
"completions/max_terminated_length": 573.0,
"completions/mean_length": 242.1875,
"completions/mean_terminated_length": 242.1875,
"completions/min_length": 119.0,
"completions/min_terminated_length": 119.0,
"entropy": 0.6462938487529755,
"epoch": 0.00072,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.007205578498542309,
"kl": 0.0016471190610900521,
"learning_rate": 1e-05,
"loss": -0.0014,
"num_tokens": 3134609.0,
"reward": 9.145380020141602,
"reward_std": 3.903296709060669,
"rewards/_dispatch_reward/mean": 9.145380020141602,
"rewards/_dispatch_reward/std": 3.903296947479248,
"sampling/importance_sampling_ratio/max": 0.6258358955383301,
"sampling/importance_sampling_ratio/mean": 0.14388859272003174,
"sampling/importance_sampling_ratio/min": 2.2541855742019834e-06,
"sampling/sampling_logp_difference/max": 0.8020744323730469,
"sampling/sampling_logp_difference/mean": 0.026974039152264595,
"step": 36,
"step_time": 76.58179467700393
},
{
"clip_ratio/high_max": 0.005726933144615032,
"clip_ratio/high_mean": 0.003037077702174429,
"clip_ratio/low_mean": 0.0002093808216159232,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0032464585310663097,
"completions/clipped_ratio": 0.0,
"completions/max_length": 635.0,
"completions/max_terminated_length": 635.0,
"completions/mean_length": 200.1875,
"completions/mean_terminated_length": 200.1875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.5727038569748402,
"epoch": 0.00074,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.02006140537559986,
"kl": 0.001625359283934813,
"learning_rate": 9.998815709812376e-06,
"loss": -0.0058,
"num_tokens": 3238256.0,
"reward": 6.689815521240234,
"reward_std": 5.825323581695557,
"rewards/_dispatch_reward/mean": 6.689815521240234,
"rewards/_dispatch_reward/std": 5.825323581695557,
"sampling/importance_sampling_ratio/max": 1.6268012523651123,
"sampling/importance_sampling_ratio/mean": 0.39891862869262695,
"sampling/importance_sampling_ratio/min": 1.152023969552829e-06,
"sampling/sampling_logp_difference/max": 0.7040293216705322,
"sampling/sampling_logp_difference/mean": 0.028268281370401382,
"step": 37,
"step_time": 109.18746044600266
},
{
"clip_ratio/high_max": 0.0074598678620532155,
"clip_ratio/high_mean": 0.0037299339310266078,
"clip_ratio/low_mean": 0.00028669723542407155,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004016631166450679,
"completions/clipped_ratio": 0.0,
"completions/max_length": 215.0,
"completions/max_terminated_length": 215.0,
"completions/mean_length": 37.78125,
"completions/mean_terminated_length": 37.78125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.17505362825068005,
"epoch": 0.00076,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.007274220697581768,
"kl": 0.0007693237591865909,
"learning_rate": 9.995263587272567e-06,
"loss": -0.0012,
"num_tokens": 3419269.0,
"reward": 1.2304946184158325,
"reward_std": 3.969721794128418,
"rewards/_dispatch_reward/mean": 1.2304946184158325,
"rewards/_dispatch_reward/std": 3.969721794128418,
"sampling/importance_sampling_ratio/max": 1.1863714456558228,
"sampling/importance_sampling_ratio/mean": 0.8759454488754272,
"sampling/importance_sampling_ratio/min": 0.1944216936826706,
"sampling/sampling_logp_difference/max": 0.5156512260437012,
"sampling/sampling_logp_difference/mean": 0.012628378346562386,
"step": 38,
"step_time": 83.71088723700086
},
{
"clip_ratio/high_max": 0.0044490770233096555,
"clip_ratio/high_mean": 0.0022245385116548277,
"clip_ratio/low_mean": 0.00031115004821913317,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002535688523494173,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 837.0,
"completions/mean_length": 279.5625,
"completions/mean_terminated_length": 222.51612854003906,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.4930221550166607,
"epoch": 0.00078,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.006831540260463953,
"kl": 0.0012955782149219885,
"learning_rate": 9.989345875977304e-06,
"loss": -0.0025,
"num_tokens": 3522270.0,
"reward": 4.621129989624023,
"reward_std": 3.7266364097595215,
"rewards/_dispatch_reward/mean": 4.621129989624023,
"rewards/_dispatch_reward/std": 3.7266364097595215,
"sampling/importance_sampling_ratio/max": 1.00277578830719,
"sampling/importance_sampling_ratio/mean": 0.3822411894798279,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.8860452175140381,
"sampling/sampling_logp_difference/mean": 0.02683982066810131,
"step": 39,
"step_time": 115.52355178999278
},
{
"clip_ratio/high_max": 0.002568041512859054,
"clip_ratio/high_mean": 0.001284020756429527,
"clip_ratio/low_mean": 0.0001502403902122751,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001434261146641802,
"completions/clipped_ratio": 0.0,
"completions/max_length": 519.0,
"completions/max_terminated_length": 519.0,
"completions/mean_length": 138.53125,
"completions/mean_terminated_length": 138.53125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.33654895424842834,
"epoch": 0.0008,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004731995053589344,
"kl": 0.0011872488248627633,
"learning_rate": 9.981066313679877e-06,
"loss": -0.0039,
"num_tokens": 3674673.0,
"reward": 3.011484146118164,
"reward_std": 4.585468769073486,
"rewards/_dispatch_reward/mean": 3.011484146118164,
"rewards/_dispatch_reward/std": 4.585468292236328,
"sampling/importance_sampling_ratio/max": 1.0014727115631104,
"sampling/importance_sampling_ratio/mean": 0.5896348357200623,
"sampling/importance_sampling_ratio/min": 7.688206096645445e-05,
"sampling/sampling_logp_difference/max": 0.5600206851959229,
"sampling/sampling_logp_difference/mean": 0.013642973266541958,
"step": 40,
"step_time": 124.35661156599963
},
{
"clip_ratio/high_max": 0.004234651743900031,
"clip_ratio/high_mean": 0.0021173258719500154,
"clip_ratio/low_mean": 0.0009653382294345647,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00308266410138458,
"completions/clipped_ratio": 0.0,
"completions/max_length": 230.0,
"completions/max_terminated_length": 230.0,
"completions/mean_length": 49.09375,
"completions/mean_terminated_length": 49.09375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.22011170587938977,
"epoch": 0.00082,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.006720908917486668,
"kl": 0.0008953095966717228,
"learning_rate": 9.970430129929293e-06,
"loss": 0.0,
"num_tokens": 3885396.0,
"reward": 0.8941321969032288,
"reward_std": 3.445835590362549,
"rewards/_dispatch_reward/mean": 0.8941321969032288,
"rewards/_dispatch_reward/std": 3.445835828781128,
"sampling/importance_sampling_ratio/max": 1.0005685091018677,
"sampling/importance_sampling_ratio/mean": 0.8388948440551758,
"sampling/importance_sampling_ratio/min": 0.09574428200721741,
"sampling/sampling_logp_difference/max": 0.5382180213928223,
"sampling/sampling_logp_difference/mean": 0.012245646677911282,
"step": 41,
"step_time": 92.02000498400594
},
{
"clip_ratio/high_max": 0.0019035532604902983,
"clip_ratio/high_mean": 0.0009517766302451491,
"clip_ratio/low_mean": 0.001191068033222109,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002142844663467258,
"completions/clipped_ratio": 0.0,
"completions/max_length": 252.0,
"completions/max_terminated_length": 252.0,
"completions/mean_length": 45.8125,
"completions/mean_terminated_length": 45.8125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.18138621716934722,
"epoch": 0.00084,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.00852085743099451,
"kl": 0.0006909975757783404,
"learning_rate": 9.957444042767179e-06,
"loss": -0.0004,
"num_tokens": 4092383.0,
"reward": 1.431800127029419,
"reward_std": 4.547163009643555,
"rewards/_dispatch_reward/mean": 1.431800127029419,
"rewards/_dispatch_reward/std": 4.547163009643555,
"sampling/importance_sampling_ratio/max": 1.0209298133850098,
"sampling/importance_sampling_ratio/mean": 0.856784462928772,
"sampling/importance_sampling_ratio/min": 0.06062455102801323,
"sampling/sampling_logp_difference/max": 0.38622474670410156,
"sampling/sampling_logp_difference/mean": 0.011158864945173264,
"step": 42,
"step_time": 87.67940832000022
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.9375,
"completions/mean_terminated_length": 2.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.0008112995728879469,
"epoch": 0.00086,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.608729871804826e-05,
"kl": 8.94069701606881e-09,
"learning_rate": 9.942116254484521e-06,
"loss": -0.0,
"num_tokens": 4377341.0,
"reward": -1.2443116903305054,
"reward_std": 0.2223791629076004,
"rewards/_dispatch_reward/mean": -1.2443116903305054,
"rewards/_dispatch_reward/std": 0.2223791480064392,
"sampling/importance_sampling_ratio/max": 1.0007485151290894,
"sampling/importance_sampling_ratio/mean": 1.0000085830688477,
"sampling/importance_sampling_ratio/min": 0.999955415725708,
"sampling/sampling_logp_difference/max": 0.0007260828278958797,
"sampling/sampling_logp_difference/mean": 1.4709847164340317e-05,
"step": 43,
"step_time": 99.83407927100052
},
{
"clip_ratio/high_max": 0.0033617592125665396,
"clip_ratio/high_mean": 0.0016808796062832698,
"clip_ratio/low_mean": 0.00013185653369873762,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0018127361399820074,
"completions/clipped_ratio": 0.0,
"completions/max_length": 270.0,
"completions/max_terminated_length": 270.0,
"completions/mean_length": 48.59375,
"completions/mean_terminated_length": 48.59375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.18187530830618925,
"epoch": 0.00088,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.008785068057477474,
"kl": 0.001074353451258503,
"learning_rate": 9.924456446440927e-06,
"loss": 0.0,
"num_tokens": 4595054.0,
"reward": 0.8892784118652344,
"reward_std": 3.5477054119110107,
"rewards/_dispatch_reward/mean": 0.8892784118652344,
"rewards/_dispatch_reward/std": 3.5477051734924316,
"sampling/importance_sampling_ratio/max": 0.9999911785125732,
"sampling/importance_sampling_ratio/mean": 0.8556162714958191,
"sampling/importance_sampling_ratio/min": 0.2585143446922302,
"sampling/sampling_logp_difference/max": 0.4261045455932617,
"sampling/sampling_logp_difference/mean": 0.011128053069114685,
"step": 44,
"step_time": 90.33923576699453
},
{
"clip_ratio/high_max": 0.005055090092355385,
"clip_ratio/high_mean": 0.0025275450461776927,
"clip_ratio/low_mean": 0.010712175630033016,
"clip_ratio/low_min": 0.0005910165491513908,
"clip_ratio/region_mean": 0.013239720676210709,
"completions/clipped_ratio": 0.0,
"completions/max_length": 310.0,
"completions/max_terminated_length": 310.0,
"completions/mean_length": 98.625,
"completions/mean_terminated_length": 98.625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.2906395886093378,
"epoch": 0.0009,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.007414973806589842,
"kl": 0.0019741024589166045,
"learning_rate": 9.904475772949665e-06,
"loss": -0.0038,
"num_tokens": 4752696.0,
"reward": 3.3065011501312256,
"reward_std": 4.428520679473877,
"rewards/_dispatch_reward/mean": 3.3065011501312256,
"rewards/_dispatch_reward/std": 4.4285197257995605,
"sampling/importance_sampling_ratio/max": 1.4952305555343628,
"sampling/importance_sampling_ratio/mean": 0.6812023520469666,
"sampling/importance_sampling_ratio/min": 0.004352574236690998,
"sampling/sampling_logp_difference/max": 0.5085635185241699,
"sampling/sampling_logp_difference/mean": 0.01729891076683998,
"step": 45,
"step_time": 94.67123420500138
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.84375,
"completions/mean_terminated_length": 2.84375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.006393522209691582,
"epoch": 0.00092,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.00017838655912782997,
"kl": 0.00011525210235774352,
"learning_rate": 9.882186854232367e-06,
"loss": -0.0,
"num_tokens": 5033021.0,
"reward": -1.1816649436950684,
"reward_std": 0.19231897592544556,
"rewards/_dispatch_reward/mean": -1.1816649436950684,
"rewards/_dispatch_reward/std": 0.19231897592544556,
"sampling/importance_sampling_ratio/max": 1.0462512969970703,
"sampling/importance_sampling_ratio/mean": 1.001424789428711,
"sampling/importance_sampling_ratio/min": 0.9997555613517761,
"sampling/sampling_logp_difference/max": 0.046176210045814514,
"sampling/sampling_logp_difference/mean": 0.0005228003719821572,
"step": 46,
"step_time": 95.09760556400579
},
{
"clip_ratio/high_max": 0.002622804546263069,
"clip_ratio/high_mean": 0.0015204323863144964,
"clip_ratio/low_mean": 0.0007803065163898282,
"clip_ratio/low_min": 0.0005091649945825338,
"clip_ratio/region_mean": 0.0023007388808764517,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1091.0,
"completions/max_terminated_length": 1091.0,
"completions/mean_length": 204.28125,
"completions/mean_terminated_length": 204.28125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.5276281349360943,
"epoch": 0.00094,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.0020519369281828403,
"kl": 0.0022789838985772803,
"learning_rate": 9.857603768447822e-06,
"loss": -0.0,
"num_tokens": 5126262.0,
"reward": 5.915221214294434,
"reward_std": 5.048460483551025,
"rewards/_dispatch_reward/mean": 5.915221214294434,
"rewards/_dispatch_reward/std": 5.048460006713867,
"sampling/importance_sampling_ratio/max": 1.0008785724639893,
"sampling/importance_sampling_ratio/mean": 0.3696955740451813,
"sampling/importance_sampling_ratio/min": 9.062721240127303e-09,
"sampling/sampling_logp_difference/max": 0.7791495323181152,
"sampling/sampling_logp_difference/mean": 0.02858079969882965,
"step": 47,
"step_time": 101.12222818900045
},
{
"clip_ratio/high_max": 0.0029862732626497746,
"clip_ratio/high_mean": 0.0016634363564662635,
"clip_ratio/low_mean": 0.0008538897527614608,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0025173261019517668,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 748.0,
"completions/mean_length": 175.1875,
"completions/mean_terminated_length": 114.7741928100586,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.46684860810637474,
"epoch": 0.00096,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.026494067162275314,
"kl": 0.0018744520202744752,
"learning_rate": 9.830742042799913e-06,
"loss": -0.0039,
"num_tokens": 5286798.0,
"reward": 2.8440439701080322,
"reward_std": 4.178732872009277,
"rewards/_dispatch_reward/mean": 2.8440439701080322,
"rewards/_dispatch_reward/std": 4.178732872009277,
"sampling/importance_sampling_ratio/max": 1.0188325643539429,
"sampling/importance_sampling_ratio/mean": 0.6148038506507874,
"sampling/importance_sampling_ratio/min": 1.9325112621260132e-09,
"sampling/sampling_logp_difference/max": 0.4544100761413574,
"sampling/sampling_logp_difference/mean": 0.022511716932058334,
"step": 48,
"step_time": 122.34452678399975
},
{
"clip_ratio/high_max": 0.004316396691137925,
"clip_ratio/high_mean": 0.0037547791725955904,
"clip_ratio/low_mean": 0.0005475876678247005,
"clip_ratio/low_min": 0.00035511364694684744,
"clip_ratio/region_mean": 0.004302366898627952,
"completions/clipped_ratio": 0.0,
"completions/max_length": 424.0,
"completions/max_terminated_length": 424.0,
"completions/mean_length": 143.21875,
"completions/mean_terminated_length": 143.21875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.47758977860212326,
"epoch": 0.00098,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01684115268290043,
"kl": 0.002730547290411778,
"learning_rate": 9.801618643730292e-06,
"loss": -0.0002,
"num_tokens": 5387272.0,
"reward": 7.64976692199707,
"reward_std": 5.87539529800415,
"rewards/_dispatch_reward/mean": 7.64976692199707,
"rewards/_dispatch_reward/std": 5.87539529800415,
"sampling/importance_sampling_ratio/max": 1.6533435583114624,
"sampling/importance_sampling_ratio/mean": 0.514319121837616,
"sampling/importance_sampling_ratio/min": 0.0009031257359310985,
"sampling/sampling_logp_difference/max": 1.2167229652404785,
"sampling/sampling_logp_difference/mean": 0.026262516155838966,
"step": 49,
"step_time": 91.79330294199463
},
{
"clip_ratio/high_max": 0.006119212484918535,
"clip_ratio/high_mean": 0.004732819041237235,
"clip_ratio/low_mean": 0.0008056239748839289,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0055384429870173335,
"completions/clipped_ratio": 0.0,
"completions/max_length": 253.0,
"completions/max_terminated_length": 253.0,
"completions/mean_length": 87.78125,
"completions/mean_terminated_length": 87.78125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.3011600449681282,
"epoch": 0.001,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01269652508199215,
"kl": 0.0032866905676200986,
"learning_rate": 9.770251966202029e-06,
"loss": -0.0006,
"num_tokens": 5541768.0,
"reward": 3.141866683959961,
"reward_std": 4.4505181312561035,
"rewards/_dispatch_reward/mean": 3.141866683959961,
"rewards/_dispatch_reward/std": 4.4505181312561035,
"sampling/importance_sampling_ratio/max": 1.649505853652954,
"sampling/importance_sampling_ratio/mean": 0.7044443488121033,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.5605548620223999,
"sampling/sampling_logp_difference/mean": 0.021160298958420753,
"step": 50,
"step_time": 98.10317065600248
},
{
"clip_ratio/high_max": 0.010189392720349133,
"clip_ratio/high_mean": 0.006060676998458803,
"clip_ratio/low_mean": 0.0006750412285327911,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006735718212439679,
"completions/clipped_ratio": 0.0,
"completions/max_length": 446.0,
"completions/max_terminated_length": 446.0,
"completions/mean_length": 139.96875,
"completions/mean_terminated_length": 139.96875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.442113908007741,
"epoch": 0.00102,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.013396586291491985,
"kl": 0.0034333480580244213,
"learning_rate": 9.736661822080943e-06,
"loss": -0.0064,
"num_tokens": 5627206.0,
"reward": 5.226400852203369,
"reward_std": 3.826179027557373,
"rewards/_dispatch_reward/mean": 5.226400852203369,
"rewards/_dispatch_reward/std": 3.826178789138794,
"sampling/importance_sampling_ratio/max": 1.497654676437378,
"sampling/importance_sampling_ratio/mean": 0.5884408950805664,
"sampling/importance_sampling_ratio/min": 0.004016702529042959,
"sampling/sampling_logp_difference/max": 1.028564691543579,
"sampling/sampling_logp_difference/mean": 0.022922225296497345,
"step": 51,
"step_time": 86.19129810099548
},
{
"clip_ratio/high_max": 0.006017926352797076,
"clip_ratio/high_mean": 0.0040219257498392835,
"clip_ratio/low_mean": 0.000564306574233342,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004586232345900498,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 442.0,
"completions/mean_length": 205.0625,
"completions/mean_terminated_length": 145.61289978027344,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.5015206560492516,
"epoch": 0.00104,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.013659187592566013,
"kl": 0.002838586748111993,
"learning_rate": 9.700869427622014e-06,
"loss": -0.0009,
"num_tokens": 5733605.0,
"reward": 6.685307502746582,
"reward_std": 5.804135799407959,
"rewards/_dispatch_reward/mean": 6.685307502746582,
"rewards/_dispatch_reward/std": 5.804135322570801,
"sampling/importance_sampling_ratio/max": 1.0001060962677002,
"sampling/importance_sampling_ratio/mean": 0.49562621116638184,
"sampling/importance_sampling_ratio/min": 3.8747170124079266e-16,
"sampling/sampling_logp_difference/max": 0.5828416347503662,
"sampling/sampling_logp_difference/mean": 0.023624923080205917,
"step": 52,
"step_time": 113.47440205099701
},
{
"clip_ratio/high_max": 0.003565533785149455,
"clip_ratio/high_mean": 0.002294554462423548,
"clip_ratio/low_mean": 0.0002680120160221122,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0025625665148254484,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1867.0,
"completions/max_terminated_length": 1867.0,
"completions/mean_length": 298.15625,
"completions/mean_terminated_length": 298.15625,
"completions/min_length": 89.0,
"completions/min_terminated_length": 89.0,
"entropy": 0.6001667827367783,
"epoch": 0.00106,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.006183857098221779,
"kl": 0.0034337107208557427,
"learning_rate": 9.662897390068735e-06,
"loss": -0.0012,
"num_tokens": 5776086.0,
"reward": 9.917367935180664,
"reward_std": 3.750269651412964,
"rewards/_dispatch_reward/mean": 9.917367935180664,
"rewards/_dispatch_reward/std": 3.7502694129943848,
"sampling/importance_sampling_ratio/max": 0.7825208902359009,
"sampling/importance_sampling_ratio/mean": 0.22198987007141113,
"sampling/importance_sampling_ratio/min": 2.639378380566923e-14,
"sampling/sampling_logp_difference/max": 0.6501812934875488,
"sampling/sampling_logp_difference/mean": 0.02595674991607666,
"step": 53,
"step_time": 92.85906081800931
},
{
"clip_ratio/high_max": 0.0028834628174081445,
"clip_ratio/high_mean": 0.0014417314087040722,
"clip_ratio/low_mean": 0.0007313800670090131,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0021731114975409582,
"completions/clipped_ratio": 0.0,
"completions/max_length": 909.0,
"completions/max_terminated_length": 909.0,
"completions/mean_length": 281.71875,
"completions/mean_terminated_length": 281.71875,
"completions/min_length": 149.0,
"completions/min_terminated_length": 149.0,
"entropy": 0.6317033842206001,
"epoch": 0.00108,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.008015364408493042,
"kl": 0.004350849922047928,
"learning_rate": 9.622769693373892e-06,
"loss": 0.0,
"num_tokens": 5815865.0,
"reward": 6.637714862823486,
"reward_std": 1.3837485313415527,
"rewards/_dispatch_reward/mean": 6.637714862823486,
"rewards/_dispatch_reward/std": 1.3837485313415527,
"sampling/importance_sampling_ratio/max": 0.5636690258979797,
"sampling/importance_sampling_ratio/mean": 0.1337074339389801,
"sampling/importance_sampling_ratio/min": 1.958301396598472e-07,
"sampling/sampling_logp_difference/max": 1.2753636837005615,
"sampling/sampling_logp_difference/mean": 0.027083205059170723,
"step": 54,
"step_time": 70.00975042999198
},
{
"clip_ratio/high_max": 0.00682789774145931,
"clip_ratio/high_mean": 0.003413948870729655,
"clip_ratio/low_mean": 0.0004340277810115367,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0038479766517411917,
"completions/clipped_ratio": 0.0,
"completions/max_length": 322.0,
"completions/max_terminated_length": 322.0,
"completions/mean_length": 102.90625,
"completions/mean_terminated_length": 102.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.39303808473050594,
"epoch": 0.0011,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.007574125658720732,
"kl": 0.00406463113904465,
"learning_rate": 9.580511683050793e-06,
"loss": -0.0003,
"num_tokens": 5976662.0,
"reward": 3.517942428588867,
"reward_std": 4.828723430633545,
"rewards/_dispatch_reward/mean": 3.517942428588867,
"rewards/_dispatch_reward/std": 4.828723907470703,
"sampling/importance_sampling_ratio/max": 0.9999936819076538,
"sampling/importance_sampling_ratio/mean": 0.5975947380065918,
"sampling/importance_sampling_ratio/min": 0.006443084683269262,
"sampling/sampling_logp_difference/max": 1.088891625404358,
"sampling/sampling_logp_difference/mean": 0.024115335196256638,
"step": 55,
"step_time": 89.8712716250011
},
{
"clip_ratio/high_max": 0.002959180681500584,
"clip_ratio/high_mean": 0.001479590340750292,
"clip_ratio/low_mean": 0.001389213364745956,
"clip_ratio/low_min": 0.0003709198790602386,
"clip_ratio/region_mean": 0.0028688037127722055,
"completions/clipped_ratio": 0.0,
"completions/max_length": 655.0,
"completions/max_terminated_length": 655.0,
"completions/mean_length": 142.5625,
"completions/mean_terminated_length": 142.5625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.5603683553636074,
"epoch": 0.00112,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.006758315023034811,
"kl": 0.002921493010944687,
"learning_rate": 9.536150050164488e-06,
"loss": -0.0016,
"num_tokens": 6141561.0,
"reward": 2.3384432792663574,
"reward_std": 3.822960376739502,
"rewards/_dispatch_reward/mean": 2.3384432792663574,
"rewards/_dispatch_reward/std": 3.822960376739502,
"sampling/importance_sampling_ratio/max": 1.0247381925582886,
"sampling/importance_sampling_ratio/mean": 0.5408236980438232,
"sampling/importance_sampling_ratio/min": 2.8452670903789112e-06,
"sampling/sampling_logp_difference/max": 0.9780905246734619,
"sampling/sampling_logp_difference/mean": 0.028260692954063416,
"step": 56,
"step_time": 100.65156877000118
}
],
"logging_steps": 1.0,
"max_steps": 160,
"num_input_tokens_seen": 6141561,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}