environment_test_affine_gin_rummy / trainer_state.json
bimabk's picture
Upload task output 1
7d7510f verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.0009000360014400576,
"eval_steps": 500,
"global_step": 15,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0416666679084301,
"completions/max_length": 2393.0,
"completions/max_terminated_length": 2393.0,
"completions/mean_length": 2078.5,
"completions/mean_terminated_length": 2086.78271484375,
"completions/min_length": 694.0,
"completions/min_terminated_length": 694.0,
"entropy": 0.33678532888491947,
"epoch": 6.000240009600384e-05,
"frac_reward_zero_std": 0.5,
"grad_norm": 2.966360330581665,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.083,
"num_tokens": 70871.0,
"reward": 0.01833334192633629,
"reward_std": 0.30440181493759155,
"rewards/rollout_reward_func/mean": 0.018333343788981438,
"rewards/rollout_reward_func/std": 0.5022702217102051,
"sampling/importance_sampling_ratio/max": 1.498870849609375,
"sampling/importance_sampling_ratio/mean": 0.9928643107414246,
"sampling/importance_sampling_ratio/min": 1.5878707237959588e-09,
"sampling/sampling_logp_difference/max": 21.133813858032227,
"sampling/sampling_logp_difference/mean": 0.04281112551689148,
"step": 1,
"step_time": 15.020983219003028
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0416666679084301,
"completions/max_length": 2388.0,
"completions/max_terminated_length": 2388.0,
"completions/mean_length": 2032.4583740234375,
"completions/mean_terminated_length": 2088.0,
"completions/min_length": 755.0,
"completions/min_terminated_length": 1434.0,
"entropy": 0.33894892781972885,
"epoch": 0.00012000480019200768,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.8514909744262695,
"kl": 0.0,
"learning_rate": 8.571428571428571e-07,
"loss": -0.0637,
"num_tokens": 140234.0,
"reward": 0.01833333633840084,
"reward_std": 0.3617991805076599,
"rewards/rollout_reward_func/mean": 0.01833333633840084,
"rewards/rollout_reward_func/std": 0.41583511233329773,
"sampling/importance_sampling_ratio/max": 1.5429106950759888,
"sampling/importance_sampling_ratio/mean": 0.9241917133331299,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 7.0373640060424805,
"sampling/sampling_logp_difference/mean": 0.03344937413930893,
"step": 2,
"step_time": 13.70249634799984
},
{
"clip_ratio/high_max": 0.002923976629972458,
"clip_ratio/high_mean": 0.002923976629972458,
"clip_ratio/low_mean": 0.001366120142241319,
"clip_ratio/low_min": 0.001366120142241319,
"clip_ratio/region_mean": 0.004290096772213777,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2396.0,
"completions/max_terminated_length": 2396.0,
"completions/mean_length": 2183.791748046875,
"completions/mean_terminated_length": 2183.791748046875,
"completions/min_length": 676.0,
"completions/min_terminated_length": 676.0,
"entropy": 0.3009852096438408,
"epoch": 0.00018000720028801153,
"frac_reward_zero_std": 0.5,
"grad_norm": 2.9021389484405518,
"kl": 0.0008198164481048783,
"learning_rate": 1.7142857142857143e-06,
"loss": 0.0103,
"num_tokens": 213894.0,
"reward": 0.2187500298023224,
"reward_std": 0.16249999403953552,
"rewards/rollout_reward_func/mean": 0.2187500149011612,
"rewards/rollout_reward_func/std": 0.27593812346458435,
"sampling/importance_sampling_ratio/max": 1.6837977170944214,
"sampling/importance_sampling_ratio/mean": 1.0721747875213623,
"sampling/importance_sampling_ratio/min": 0.5105805397033691,
"sampling/sampling_logp_difference/max": 0.7568278312683105,
"sampling/sampling_logp_difference/mean": 0.017255382612347603,
"step": 3,
"step_time": 13.48628649100101
},
{
"clip_ratio/high_max": 0.002923976629972458,
"clip_ratio/high_mean": 0.002923976629972458,
"clip_ratio/low_mean": 0.0010162601247429848,
"clip_ratio/low_min": 0.0010162601247429848,
"clip_ratio/region_mean": 0.003940236754715443,
"completions/clipped_ratio": 0.0416666679084301,
"completions/max_length": 2407.0,
"completions/max_terminated_length": 2407.0,
"completions/mean_length": 2167.5,
"completions/mean_terminated_length": 2163.95654296875,
"completions/min_length": 751.0,
"completions/min_terminated_length": 751.0,
"entropy": 0.26422637701034546,
"epoch": 0.00024000960038401536,
"frac_reward_zero_std": 0.1666666716337204,
"grad_norm": 3.117863178253174,
"kl": 0.0008683214109623805,
"learning_rate": 2.5714285714285716e-06,
"loss": 0.0319,
"num_tokens": 286603.0,
"reward": 0.25833338499069214,
"reward_std": 0.17131535708904266,
"rewards/rollout_reward_func/mean": 0.25833335518836975,
"rewards/rollout_reward_func/std": 0.2583167254924774,
"sampling/importance_sampling_ratio/max": 1.4256991147994995,
"sampling/importance_sampling_ratio/mean": 0.969955325126648,
"sampling/importance_sampling_ratio/min": 0.5823519825935364,
"sampling/sampling_logp_difference/max": 0.329906702041626,
"sampling/sampling_logp_difference/mean": 0.015135754831135273,
"step": 4,
"step_time": 12.97427468299793
},
{
"clip_ratio/high_max": 0.0011415525029102962,
"clip_ratio/high_mean": 0.0011415525029102962,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0011415525029102962,
"completions/clipped_ratio": 0.0416666679084301,
"completions/max_length": 2393.0,
"completions/max_terminated_length": 2393.0,
"completions/mean_length": 1980.166748046875,
"completions/mean_terminated_length": 1983.9130859375,
"completions/min_length": 667.0,
"completions/min_terminated_length": 667.0,
"entropy": 0.2581128428379695,
"epoch": 0.0003000120004800192,
"frac_reward_zero_std": 0.3333333432674408,
"grad_norm": 2.8420257568359375,
"kl": 0.0006481069843478812,
"learning_rate": 3.4285714285714285e-06,
"loss": 0.0613,
"num_tokens": 355097.0,
"reward": 0.04541667923331261,
"reward_std": 0.38386815786361694,
"rewards/rollout_reward_func/mean": 0.04541667178273201,
"rewards/rollout_reward_func/std": 0.4871387481689453,
"sampling/importance_sampling_ratio/max": 1.2240864038467407,
"sampling/importance_sampling_ratio/mean": 0.9118739366531372,
"sampling/importance_sampling_ratio/min": 2.346623517723856e-08,
"sampling/sampling_logp_difference/max": 9.750813484191895,
"sampling/sampling_logp_difference/mean": 0.05021395534276962,
"step": 5,
"step_time": 13.251750441997501
},
{
"clip_ratio/high_max": 0.0021307161853959164,
"clip_ratio/high_mean": 0.0021307161853959164,
"clip_ratio/low_mean": 0.0034020394862939916,
"clip_ratio/low_min": 0.0034020394862939916,
"clip_ratio/region_mean": 0.005532755671689908,
"completions/clipped_ratio": 0.0416666679084301,
"completions/max_length": 2471.0,
"completions/max_terminated_length": 2396.0,
"completions/mean_length": 1981.5833740234375,
"completions/mean_terminated_length": 1960.304443359375,
"completions/min_length": 669.0,
"completions/min_terminated_length": 669.0,
"entropy": 0.388533021012942,
"epoch": 0.00036001440057602306,
"frac_reward_zero_std": 0.1666666716337204,
"grad_norm": 3.518169403076172,
"kl": 0.0009793323115445673,
"learning_rate": 4.2857142857142855e-06,
"loss": -0.1467,
"num_tokens": 423828.0,
"reward": 0.06583334505558014,
"reward_std": 0.41378408670425415,
"rewards/rollout_reward_func/mean": 0.06583333760499954,
"rewards/rollout_reward_func/std": 0.450168251991272,
"sampling/importance_sampling_ratio/max": 1.502521276473999,
"sampling/importance_sampling_ratio/mean": 1.0047988891601562,
"sampling/importance_sampling_ratio/min": 0.00031536107417196035,
"sampling/sampling_logp_difference/max": 6.730105876922607,
"sampling/sampling_logp_difference/mean": 0.027479952201247215,
"step": 6,
"step_time": 13.971570022000378
},
{
"clip_ratio/high_max": 0.0014492752961814404,
"clip_ratio/high_mean": 0.0014492752961814404,
"clip_ratio/low_mean": 0.0009009009227156639,
"clip_ratio/low_min": 0.0009009009227156639,
"clip_ratio/region_mean": 0.0023501762188971043,
"completions/clipped_ratio": 0.0416666679084301,
"completions/max_length": 2321.0,
"completions/max_terminated_length": 2321.0,
"completions/mean_length": 2112.416748046875,
"completions/mean_terminated_length": 2113.652099609375,
"completions/min_length": 1386.0,
"completions/min_terminated_length": 1386.0,
"entropy": 0.2739458481470744,
"epoch": 0.0004200168006720269,
"frac_reward_zero_std": 0.3333333432674408,
"grad_norm": 2.717536449432373,
"kl": 0.0010390299139544368,
"learning_rate": 5.142857142857143e-06,
"loss": -0.1854,
"num_tokens": 495731.0,
"reward": 0.15916667878627777,
"reward_std": 0.1548699289560318,
"rewards/rollout_reward_func/mean": 0.15916667878627777,
"rewards/rollout_reward_func/std": 0.32958361506462097,
"sampling/importance_sampling_ratio/max": 1.5678151845932007,
"sampling/importance_sampling_ratio/mean": 0.9848524928092957,
"sampling/importance_sampling_ratio/min": 2.720875045270077e-06,
"sampling/sampling_logp_difference/max": 9.624576568603516,
"sampling/sampling_logp_difference/mean": 0.03278766945004463,
"step": 7,
"step_time": 12.972795774996484
},
{
"clip_ratio/high_max": 0.0037208329886198044,
"clip_ratio/high_mean": 0.0037208329886198044,
"clip_ratio/low_mean": 0.0018710837854693334,
"clip_ratio/low_min": 0.0018710837854693334,
"clip_ratio/region_mean": 0.005591916696478923,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2392.0,
"completions/max_terminated_length": 2392.0,
"completions/mean_length": 2101.416748046875,
"completions/mean_terminated_length": 2181.381103515625,
"completions/min_length": 811.0,
"completions/min_terminated_length": 1519.0,
"entropy": 0.38193629682064056,
"epoch": 0.0004800192007680307,
"frac_reward_zero_std": 0.1666666716337204,
"grad_norm": 4.98358678817749,
"kl": 0.001539110904559493,
"learning_rate": 6e-06,
"loss": 0.0265,
"num_tokens": 567094.0,
"reward": 0.01291667204350233,
"reward_std": 0.4608933925628662,
"rewards/rollout_reward_func/mean": 0.01291667204350233,
"rewards/rollout_reward_func/std": 0.49684640765190125,
"sampling/importance_sampling_ratio/max": 1.5221177339553833,
"sampling/importance_sampling_ratio/mean": 0.9741430282592773,
"sampling/importance_sampling_ratio/min": 0.37359893321990967,
"sampling/sampling_logp_difference/max": 0.40885448455810547,
"sampling/sampling_logp_difference/mean": 0.01908516138792038,
"step": 8,
"step_time": 13.575062246001835
},
{
"clip_ratio/high_max": 0.0010482179932296276,
"clip_ratio/high_mean": 0.0010482179932296276,
"clip_ratio/low_mean": 0.0026881719628969827,
"clip_ratio/low_min": 0.0026881719628969827,
"clip_ratio/region_mean": 0.0037363899561266103,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2403.0,
"completions/max_terminated_length": 2403.0,
"completions/mean_length": 2160.666748046875,
"completions/mean_terminated_length": 2160.666748046875,
"completions/min_length": 854.0,
"completions/min_terminated_length": 854.0,
"entropy": 0.2373982494076093,
"epoch": 0.0005400216008640345,
"frac_reward_zero_std": 0.3333333432674408,
"grad_norm": 4.026190280914307,
"kl": 0.001170102283746625,
"learning_rate": 6.857142857142857e-06,
"loss": 0.2258,
"num_tokens": 639900.0,
"reward": 0.2250000238418579,
"reward_std": 0.157368004322052,
"rewards/rollout_reward_func/mean": 0.2250000238418579,
"rewards/rollout_reward_func/std": 0.26091811060905457,
"sampling/importance_sampling_ratio/max": 1.3226174116134644,
"sampling/importance_sampling_ratio/mean": 0.9841742515563965,
"sampling/importance_sampling_ratio/min": 3.137126449999188e-14,
"sampling/sampling_logp_difference/max": 19.257625579833984,
"sampling/sampling_logp_difference/mean": 0.0487070232629776,
"step": 9,
"step_time": 14.116852209996068
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2421.0,
"completions/max_terminated_length": 2421.0,
"completions/mean_length": 1925.8333740234375,
"completions/mean_terminated_length": 1925.8333740234375,
"completions/min_length": 400.0,
"completions/min_terminated_length": 400.0,
"entropy": 0.2908450166384379,
"epoch": 0.0006000240009600384,
"frac_reward_zero_std": 0.1666666716337204,
"grad_norm": 2.877199172973633,
"kl": 0.0020954393645903715,
"learning_rate": 7.714285714285714e-06,
"loss": 0.1587,
"num_tokens": 706860.0,
"reward": 0.16597223281860352,
"reward_std": 0.26020175218582153,
"rewards/rollout_reward_func/mean": 0.16597223281860352,
"rewards/rollout_reward_func/std": 0.42211174964904785,
"sampling/importance_sampling_ratio/max": 1.2571473121643066,
"sampling/importance_sampling_ratio/mean": 0.8514933586120605,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 12.186629295349121,
"sampling/sampling_logp_difference/mean": 0.057263195514678955,
"step": 10,
"step_time": 1903.418363845998
},
{
"clip_ratio/high_max": 0.0009633911152680715,
"clip_ratio/high_mean": 0.0009633911152680715,
"clip_ratio/low_mean": 0.002207505516707897,
"clip_ratio/low_min": 0.002207505516707897,
"clip_ratio/region_mean": 0.003170896631975969,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2416.0,
"completions/max_terminated_length": 2416.0,
"completions/mean_length": 1997.4583740234375,
"completions/mean_terminated_length": 1997.4583740234375,
"completions/min_length": 650.0,
"completions/min_terminated_length": 650.0,
"entropy": 0.2779182270169258,
"epoch": 0.0006600264010560422,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.155987739562988,
"kl": 0.003926942406299834,
"learning_rate": 8.571428571428571e-06,
"loss": -0.151,
"num_tokens": 775448.0,
"reward": 0.20642617344856262,
"reward_std": 0.413322776556015,
"rewards/rollout_reward_func/mean": 0.20642615854740143,
"rewards/rollout_reward_func/std": 0.4893973767757416,
"sampling/importance_sampling_ratio/max": 1.5803664922714233,
"sampling/importance_sampling_ratio/mean": 0.9162782430648804,
"sampling/importance_sampling_ratio/min": 1.3544125465614343e-07,
"sampling/sampling_logp_difference/max": 10.99795150756836,
"sampling/sampling_logp_difference/mean": 0.03617515414953232,
"step": 11,
"step_time": 1941.150138971001
},
{
"clip_ratio/high_max": 0.001883239174882571,
"clip_ratio/high_mean": 0.001883239174882571,
"clip_ratio/low_mean": 0.0011037527583539486,
"clip_ratio/low_min": 0.0011037527583539486,
"clip_ratio/region_mean": 0.0029869919332365194,
"completions/clipped_ratio": 0.0416666679084301,
"completions/max_length": 2480.0,
"completions/max_terminated_length": 2480.0,
"completions/mean_length": 1994.3333740234375,
"completions/mean_terminated_length": 1990.95654296875,
"completions/min_length": 741.0,
"completions/min_terminated_length": 741.0,
"entropy": 0.2965584595998128,
"epoch": 0.0007200288011520461,
"frac_reward_zero_std": 0.1666666716337204,
"grad_norm": 2.91935133934021,
"kl": 0.006807463922693084,
"learning_rate": 9.428571428571428e-06,
"loss": 0.2746,
"num_tokens": 843904.0,
"reward": 0.19167006015777588,
"reward_std": 0.2366676777601242,
"rewards/rollout_reward_func/mean": 0.19167006015777588,
"rewards/rollout_reward_func/std": 0.3872174620628357,
"sampling/importance_sampling_ratio/max": 1.4269827604293823,
"sampling/importance_sampling_ratio/mean": 0.9134854078292847,
"sampling/importance_sampling_ratio/min": 4.090348326712956e-08,
"sampling/sampling_logp_difference/max": 7.874401569366455,
"sampling/sampling_logp_difference/mean": 0.035650916397571564,
"step": 12,
"step_time": 1983.2984636460005
},
{
"clip_ratio/high_max": 0.002906976888577143,
"clip_ratio/high_mean": 0.002906976888577143,
"clip_ratio/low_mean": 0.0014880953046182792,
"clip_ratio/low_min": 0.0014880953046182792,
"clip_ratio/region_mean": 0.004395072193195422,
"completions/clipped_ratio": 0.0416666679084301,
"completions/max_length": 2377.0,
"completions/max_terminated_length": 2377.0,
"completions/mean_length": 1990.75,
"completions/mean_terminated_length": 1974.2174072265625,
"completions/min_length": 818.0,
"completions/min_terminated_length": 818.0,
"entropy": 0.24792559693257013,
"epoch": 0.00078003120124805,
"frac_reward_zero_std": 0.1666666716337204,
"grad_norm": 2.9418699741363525,
"kl": 0.007263694618207713,
"learning_rate": 1.0285714285714286e-05,
"loss": 0.3781,
"num_tokens": 912501.0,
"reward": 0.2585095167160034,
"reward_std": 0.2756158113479614,
"rewards/rollout_reward_func/mean": 0.25850948691368103,
"rewards/rollout_reward_func/std": 0.37159210443496704,
"sampling/importance_sampling_ratio/max": 1.6105681657791138,
"sampling/importance_sampling_ratio/mean": 0.9647274017333984,
"sampling/importance_sampling_ratio/min": 6.621257142569448e-08,
"sampling/sampling_logp_difference/max": 10.260540008544922,
"sampling/sampling_logp_difference/mean": 0.05481240525841713,
"step": 13,
"step_time": 2070.256960532004
},
{
"clip_ratio/high_max": 0.0006485084304586053,
"clip_ratio/high_mean": 0.0006485084304586053,
"clip_ratio/low_mean": 0.0014430014416575432,
"clip_ratio/low_min": 0.0014430014416575432,
"clip_ratio/region_mean": 0.0020915098721161485,
"completions/clipped_ratio": 0.0416666679084301,
"completions/max_length": 2370.0,
"completions/max_terminated_length": 2370.0,
"completions/mean_length": 2130.916748046875,
"completions/mean_terminated_length": 2126.652099609375,
"completions/min_length": 1223.0,
"completions/min_terminated_length": 1223.0,
"entropy": 0.16453668102622032,
"epoch": 0.0008400336013440538,
"frac_reward_zero_std": 0.3333333432674408,
"grad_norm": 1.9463223218917847,
"kl": 0.014017233702664575,
"learning_rate": 1.1142857142857143e-05,
"loss": -0.1296,
"num_tokens": 984804.0,
"reward": 0.2805420160293579,
"reward_std": 0.17220479249954224,
"rewards/rollout_reward_func/mean": 0.2805420160293579,
"rewards/rollout_reward_func/std": 0.2652944028377533,
"sampling/importance_sampling_ratio/max": 1.4470672607421875,
"sampling/importance_sampling_ratio/mean": 0.9239229559898376,
"sampling/importance_sampling_ratio/min": 4.1741555338842856e-17,
"sampling/sampling_logp_difference/max": 17.244997024536133,
"sampling/sampling_logp_difference/mean": 0.0690554603934288,
"step": 14,
"step_time": 2234.725167364999
},
{
"clip_ratio/high_max": 0.001851851896693309,
"clip_ratio/high_mean": 0.001851851896693309,
"clip_ratio/low_mean": 0.0009523809421807528,
"clip_ratio/low_min": 0.0009523809421807528,
"clip_ratio/region_mean": 0.002804232838874062,
"completions/clipped_ratio": 0.0416666679084301,
"completions/max_length": 2404.0,
"completions/max_terminated_length": 2404.0,
"completions/mean_length": 1999.125,
"completions/mean_terminated_length": 1988.5653076171875,
"completions/min_length": 751.0,
"completions/min_terminated_length": 751.0,
"entropy": 0.18070783466100693,
"epoch": 0.0009000360014400576,
"frac_reward_zero_std": 0.1666666716337204,
"grad_norm": 1.9805845022201538,
"kl": 0.017273214490463335,
"learning_rate": 1.2e-05,
"loss": 0.1801,
"num_tokens": 1053620.0,
"reward": 0.3113042116165161,
"reward_std": 0.2151390165090561,
"rewards/rollout_reward_func/mean": 0.3113042116165161,
"rewards/rollout_reward_func/std": 0.31088533997535706,
"sampling/importance_sampling_ratio/max": 1.6857080459594727,
"sampling/importance_sampling_ratio/mean": 0.922259509563446,
"sampling/importance_sampling_ratio/min": 4.6535773281716764e-14,
"sampling/sampling_logp_difference/max": 20.623287200927734,
"sampling/sampling_logp_difference/mean": 0.05669760704040527,
"step": 15,
"step_time": 2093.846225461999
}
],
"logging_steps": 1.0,
"max_steps": 33332,
"num_input_tokens_seen": 1053620,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}