gin_rummy_2G / trainer_state.json
Gege24's picture
Upload task output 1
726f2f1 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.006,
"eval_steps": 500,
"global_step": 75,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2813.0,
"completions/max_terminated_length": 2813.0,
"completions/mean_length": 2062.46875,
"completions/mean_terminated_length": 2062.46875,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.1340037016198039,
"epoch": 8e-05,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.9753277897834778,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.0097,
"num_tokens": 78941.0,
"reward": 0.5106250047683716,
"reward_std": 0.16591878235340118,
"rewards/rollout_reward_func/mean": 0.5106250047683716,
"rewards/rollout_reward_func/std": 0.38574549555778503,
"sampling/importance_sampling_ratio/max": 1.89468514919281,
"sampling/importance_sampling_ratio/mean": 0.917938768863678,
"sampling/importance_sampling_ratio/min": 0.26035696268081665,
"sampling/sampling_logp_difference/max": 1.035329818725586,
"sampling/sampling_logp_difference/mean": 0.020964600145816803,
"step": 1,
"step_time": 18.408817325000086
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2795.0,
"completions/max_terminated_length": 2795.0,
"completions/mean_length": 2091.09375,
"completions/mean_terminated_length": 2091.09375,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.1304742144420743,
"epoch": 0.00016,
"frac_reward_zero_std": 0.625,
"grad_norm": 1.355797290802002,
"kl": 0.0,
"learning_rate": 2.2857142857142855e-07,
"loss": -0.0694,
"num_tokens": 158774.0,
"reward": 0.38593748211860657,
"reward_std": 0.15246255695819855,
"rewards/rollout_reward_func/mean": 0.38593748211860657,
"rewards/rollout_reward_func/std": 0.3016391694545746,
"sampling/importance_sampling_ratio/max": 2.747450828552246,
"sampling/importance_sampling_ratio/mean": 0.995655357837677,
"sampling/importance_sampling_ratio/min": 0.30046260356903076,
"sampling/sampling_logp_difference/max": 1.1795392036437988,
"sampling/sampling_logp_difference/mean": 0.022426610812544823,
"step": 2,
"step_time": 17.007036188999905
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2783.0,
"completions/max_terminated_length": 2783.0,
"completions/mean_length": 1881.75,
"completions/mean_terminated_length": 1881.75,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.12374210823327303,
"epoch": 0.00024,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.6671572327613831,
"kl": 0.0014705628045703634,
"learning_rate": 4.571428571428571e-07,
"loss": -0.0235,
"num_tokens": 231260.0,
"reward": 0.4012500047683716,
"reward_std": 0.20683754980564117,
"rewards/rollout_reward_func/mean": 0.4012500047683716,
"rewards/rollout_reward_func/std": 0.33187392354011536,
"sampling/importance_sampling_ratio/max": 1.3709925413131714,
"sampling/importance_sampling_ratio/mean": 0.8955257534980774,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.9760880470275879,
"sampling/sampling_logp_difference/mean": 0.020091338083148003,
"step": 3,
"step_time": 16.527492177000227
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2795.0,
"completions/max_terminated_length": 2795.0,
"completions/mean_length": 2263.03125,
"completions/mean_terminated_length": 2263.03125,
"completions/min_length": 1569.0,
"completions/min_terminated_length": 1569.0,
"entropy": 0.15253359219059348,
"epoch": 0.00032,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.9312232136726379,
"kl": 0.0016055026353569701,
"learning_rate": 6.857142857142857e-07,
"loss": -0.0241,
"num_tokens": 316879.0,
"reward": 0.3787500262260437,
"reward_std": 0.0624999962747097,
"rewards/rollout_reward_func/mean": 0.3787500262260437,
"rewards/rollout_reward_func/std": 0.26268768310546875,
"sampling/importance_sampling_ratio/max": 1.9853609800338745,
"sampling/importance_sampling_ratio/mean": 0.9613277912139893,
"sampling/importance_sampling_ratio/min": 0.4037262201309204,
"sampling/sampling_logp_difference/max": 0.6126779317855835,
"sampling/sampling_logp_difference/mean": 0.02291969209909439,
"step": 4,
"step_time": 18.10109623099993
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.003289473708719015,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003289473708719015,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2807.0,
"completions/max_terminated_length": 2807.0,
"completions/mean_length": 2197.1875,
"completions/mean_terminated_length": 2197.1875,
"completions/min_length": 1570.0,
"completions/min_terminated_length": 1570.0,
"entropy": 0.14702600054442883,
"epoch": 0.0004,
"frac_reward_zero_std": 0.5,
"grad_norm": 2.2064177989959717,
"kl": 0.002026251120696543,
"learning_rate": 9.142857142857142e-07,
"loss": -0.0169,
"num_tokens": 400370.0,
"reward": 0.4140625,
"reward_std": 0.15217570960521698,
"rewards/rollout_reward_func/mean": 0.4140625,
"rewards/rollout_reward_func/std": 0.33838188648223877,
"sampling/importance_sampling_ratio/max": 2.348391532897949,
"sampling/importance_sampling_ratio/mean": 1.0998704433441162,
"sampling/importance_sampling_ratio/min": 0.6395935416221619,
"sampling/sampling_logp_difference/max": 0.6357507705688477,
"sampling/sampling_logp_difference/mean": 0.020555175840854645,
"step": 5,
"step_time": 16.967482953000058
},
{
"clip_ratio/high_max": 0.009868421126157045,
"clip_ratio/high_mean": 0.004934210563078523,
"clip_ratio/low_mean": 0.0034829721553251147,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008417182718403637,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2449.0,
"completions/max_terminated_length": 2449.0,
"completions/mean_length": 1818.28125,
"completions/mean_terminated_length": 1818.28125,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.0911772302351892,
"epoch": 0.00048,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.8283352255821228,
"kl": 0.001795254382159328,
"learning_rate": 1.1428571428571428e-06,
"loss": -0.0209,
"num_tokens": 470216.0,
"reward": 0.5893750190734863,
"reward_std": 0.1562499850988388,
"rewards/rollout_reward_func/mean": 0.5893750190734863,
"rewards/rollout_reward_func/std": 0.4223737418651581,
"sampling/importance_sampling_ratio/max": 1.7877978086471558,
"sampling/importance_sampling_ratio/mean": 0.9894595146179199,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.9945626258850098,
"sampling/sampling_logp_difference/mean": 0.01755390875041485,
"step": 6,
"step_time": 15.150656265999714
},
{
"clip_ratio/high_max": 0.009375000139698386,
"clip_ratio/high_mean": 0.0062500000931322575,
"clip_ratio/low_mean": 0.006411405862309039,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012661405955441296,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2791.0,
"completions/max_terminated_length": 2791.0,
"completions/mean_length": 2104.28125,
"completions/mean_terminated_length": 2104.28125,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.16090343240648508,
"epoch": 0.00056,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.4131731986999512,
"kl": 0.002872211887734011,
"learning_rate": 1.3714285714285715e-06,
"loss": -0.03,
"num_tokens": 550373.0,
"reward": 0.29500001668930054,
"reward_std": 0.0949999988079071,
"rewards/rollout_reward_func/mean": 0.29500001668930054,
"rewards/rollout_reward_func/std": 0.1687716543674469,
"sampling/importance_sampling_ratio/max": 1.3797976970672607,
"sampling/importance_sampling_ratio/mean": 0.9415616989135742,
"sampling/importance_sampling_ratio/min": 0.29769256711006165,
"sampling/sampling_logp_difference/max": 0.9464168548583984,
"sampling/sampling_logp_difference/mean": 0.02406277321279049,
"step": 7,
"step_time": 16.95326116199999
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0016447368543595076,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0016447368543595076,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2814.0,
"completions/max_terminated_length": 2814.0,
"completions/mean_length": 2265.5625,
"completions/mean_terminated_length": 2265.5625,
"completions/min_length": 1569.0,
"completions/min_terminated_length": 1569.0,
"entropy": 0.20542557537555695,
"epoch": 0.00064,
"frac_reward_zero_std": 0.75,
"grad_norm": 1.0486057996749878,
"kl": 0.00249500987411011,
"learning_rate": 1.6e-06,
"loss": 0.0307,
"num_tokens": 636318.0,
"reward": 0.3434374928474426,
"reward_std": 0.08029377460479736,
"rewards/rollout_reward_func/mean": 0.3434374928474426,
"rewards/rollout_reward_func/std": 0.22245851159095764,
"sampling/importance_sampling_ratio/max": 2.239882230758667,
"sampling/importance_sampling_ratio/mean": 0.9191794395446777,
"sampling/importance_sampling_ratio/min": 0.3405879735946655,
"sampling/sampling_logp_difference/max": 1.0898922681808472,
"sampling/sampling_logp_difference/mean": 0.023122236132621765,
"step": 8,
"step_time": 18.007336240000086
},
{
"clip_ratio/high_max": 0.010620915098115802,
"clip_ratio/high_mean": 0.005310457549057901,
"clip_ratio/low_mean": 0.0016447368543595076,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0069551944034174085,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2790.0,
"completions/max_terminated_length": 2790.0,
"completions/mean_length": 1646.03125,
"completions/mean_terminated_length": 1646.03125,
"completions/min_length": 1054.0,
"completions/min_terminated_length": 1054.0,
"entropy": 0.13724102126434445,
"epoch": 0.00072,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.481195092201233,
"kl": 0.0015346993204730097,
"learning_rate": 1.8285714285714284e-06,
"loss": -0.0189,
"num_tokens": 701005.0,
"reward": 0.6278125047683716,
"reward_std": 0.30871257185935974,
"rewards/rollout_reward_func/mean": 0.6278125047683716,
"rewards/rollout_reward_func/std": 0.4518972933292389,
"sampling/importance_sampling_ratio/max": 1.9988352060317993,
"sampling/importance_sampling_ratio/mean": 0.9468162059783936,
"sampling/importance_sampling_ratio/min": 0.4476884603500366,
"sampling/sampling_logp_difference/max": 0.773470401763916,
"sampling/sampling_logp_difference/mean": 0.02107076346874237,
"step": 9,
"step_time": 16.103736942000182
},
{
"clip_ratio/high_max": 0.008938953513279557,
"clip_ratio/high_mean": 0.004469476756639779,
"clip_ratio/low_mean": 0.0015243901871144772,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005993866943754256,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2799.0,
"completions/max_terminated_length": 2799.0,
"completions/mean_length": 2226.4375,
"completions/mean_terminated_length": 2226.4375,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.18680323101580143,
"epoch": 0.0008,
"frac_reward_zero_std": 0.625,
"grad_norm": 2.0065183639526367,
"kl": 0.002535051797167398,
"learning_rate": 2.057142857142857e-06,
"loss": 0.0495,
"num_tokens": 785722.0,
"reward": 0.4443749785423279,
"reward_std": 0.08841878175735474,
"rewards/rollout_reward_func/mean": 0.4443749785423279,
"rewards/rollout_reward_func/std": 0.35590803623199463,
"sampling/importance_sampling_ratio/max": 2.790905714035034,
"sampling/importance_sampling_ratio/mean": 0.940368115901947,
"sampling/importance_sampling_ratio/min": 0.22617992758750916,
"sampling/sampling_logp_difference/max": 0.6851506233215332,
"sampling/sampling_logp_difference/mean": 0.025203729048371315,
"step": 10,
"step_time": 17.030096009999966
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0015625000232830644,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0015625000232830644,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2783.0,
"completions/max_terminated_length": 2783.0,
"completions/mean_length": 1966.875,
"completions/mean_terminated_length": 1966.875,
"completions/min_length": 1054.0,
"completions/min_terminated_length": 1054.0,
"entropy": 0.1440325272269547,
"epoch": 0.00088,
"frac_reward_zero_std": 0.625,
"grad_norm": 1.0474733114242554,
"kl": 0.0011928395033464767,
"learning_rate": 2.2857142857142856e-06,
"loss": -0.0121,
"num_tokens": 861366.0,
"reward": 0.4596875011920929,
"reward_std": 0.14279377460479736,
"rewards/rollout_reward_func/mean": 0.4596875011920929,
"rewards/rollout_reward_func/std": 0.38282889127731323,
"sampling/importance_sampling_ratio/max": 1.9087510108947754,
"sampling/importance_sampling_ratio/mean": 0.9341506361961365,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.9133691787719727,
"sampling/sampling_logp_difference/mean": 0.02222413383424282,
"step": 11,
"step_time": 16.685985845999994
},
{
"clip_ratio/high_max": 0.007352941203862429,
"clip_ratio/high_mean": 0.0055147059028968215,
"clip_ratio/low_mean": 0.0036764706019312143,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009191176504828036,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2801.0,
"completions/max_terminated_length": 2801.0,
"completions/mean_length": 1857.96875,
"completions/mean_terminated_length": 1857.96875,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.17092999629676342,
"epoch": 0.00096,
"frac_reward_zero_std": 0.5,
"grad_norm": 2.0095906257629395,
"kl": 0.0024750066513661295,
"learning_rate": 2.5142857142857142e-06,
"loss": -0.0228,
"num_tokens": 933262.0,
"reward": 0.48593753576278687,
"reward_std": 0.20529377460479736,
"rewards/rollout_reward_func/mean": 0.48593753576278687,
"rewards/rollout_reward_func/std": 0.38762184977531433,
"sampling/importance_sampling_ratio/max": 1.6060364246368408,
"sampling/importance_sampling_ratio/mean": 0.9451028108596802,
"sampling/importance_sampling_ratio/min": 0.34411877393722534,
"sampling/sampling_logp_difference/max": 1.0912601947784424,
"sampling/sampling_logp_difference/mean": 0.019420120865106583,
"step": 12,
"step_time": 17.06754343600005
},
{
"clip_ratio/high_max": 0.007936508161947131,
"clip_ratio/high_mean": 0.003968254080973566,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0057043652050197124,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2812.0,
"completions/max_terminated_length": 2812.0,
"completions/mean_length": 1741.5,
"completions/mean_terminated_length": 1741.5,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.11786343855783343,
"epoch": 0.00104,
"frac_reward_zero_std": 0.5,
"grad_norm": 2.0365312099456787,
"kl": 0.0034534272163000423,
"learning_rate": 2.742857142857143e-06,
"loss": -0.0389,
"num_tokens": 1001046.0,
"reward": 0.6475000381469727,
"reward_std": 0.24292194843292236,
"rewards/rollout_reward_func/mean": 0.6475000381469727,
"rewards/rollout_reward_func/std": 0.4391413629055023,
"sampling/importance_sampling_ratio/max": 2.502153158187866,
"sampling/importance_sampling_ratio/mean": 1.0042061805725098,
"sampling/importance_sampling_ratio/min": 0.4875127375125885,
"sampling/sampling_logp_difference/max": 0.6685242652893066,
"sampling/sampling_logp_difference/mean": 0.01504062581807375,
"step": 13,
"step_time": 16.125135786999863
},
{
"clip_ratio/high_max": 0.011430230224505067,
"clip_ratio/high_mean": 0.0057151151122525334,
"clip_ratio/low_mean": 0.0022321429569274187,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007947258069179952,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2802.0,
"completions/max_terminated_length": 2802.0,
"completions/mean_length": 2010.3125,
"completions/mean_terminated_length": 2010.3125,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.1693209670484066,
"epoch": 0.00112,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.2513231039047241,
"kl": 0.002481764371623285,
"learning_rate": 2.9714285714285716e-06,
"loss": 0.0309,
"num_tokens": 1078101.0,
"reward": 0.4753125011920929,
"reward_std": 0.19296419620513916,
"rewards/rollout_reward_func/mean": 0.4753125011920929,
"rewards/rollout_reward_func/std": 0.3761346936225891,
"sampling/importance_sampling_ratio/max": 1.5099362134933472,
"sampling/importance_sampling_ratio/mean": 0.9022589921951294,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.6567137241363525,
"sampling/sampling_logp_difference/mean": 0.022564683109521866,
"step": 14,
"step_time": 16.688260055
},
{
"clip_ratio/high_max": 0.0036764706019312143,
"clip_ratio/high_mean": 0.0018382353009656072,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003574346425011754,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2775.0,
"completions/max_terminated_length": 2775.0,
"completions/mean_length": 1887.71875,
"completions/mean_terminated_length": 1887.71875,
"completions/min_length": 1054.0,
"completions/min_terminated_length": 1054.0,
"entropy": 0.14878523536026478,
"epoch": 0.0012,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.092544674873352,
"kl": 0.0021166762671782635,
"learning_rate": 3.2e-06,
"loss": -0.014,
"num_tokens": 1150803.0,
"reward": 0.4506249725818634,
"reward_std": 0.2620203495025635,
"rewards/rollout_reward_func/mean": 0.4506249725818634,
"rewards/rollout_reward_func/std": 0.3878471255302429,
"sampling/importance_sampling_ratio/max": 2.092060089111328,
"sampling/importance_sampling_ratio/mean": 1.0061091184616089,
"sampling/importance_sampling_ratio/min": 0.519212543964386,
"sampling/sampling_logp_difference/max": 0.664109468460083,
"sampling/sampling_logp_difference/mean": 0.02180980145931244,
"step": 15,
"step_time": 16.993085070999996
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0016891892300918698,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0016891892300918698,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2812.0,
"completions/max_terminated_length": 2812.0,
"completions/mean_length": 2229.28125,
"completions/mean_terminated_length": 2229.28125,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.15634657256305218,
"epoch": 0.00128,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.09010659158229828,
"kl": 0.005270412558274984,
"learning_rate": 3.428571428571428e-06,
"loss": 0.005,
"num_tokens": 1235464.0,
"reward": 0.48374998569488525,
"reward_std": 0.0625,
"rewards/rollout_reward_func/mean": 0.48374998569488525,
"rewards/rollout_reward_func/std": 0.36630454659461975,
"sampling/importance_sampling_ratio/max": 1.890110969543457,
"sampling/importance_sampling_ratio/mean": 0.9257134199142456,
"sampling/importance_sampling_ratio/min": 0.4630853831768036,
"sampling/sampling_logp_difference/max": 0.69629967212677,
"sampling/sampling_logp_difference/mean": 0.020435180515050888,
"step": 16,
"step_time": 17.667978367999467
},
{
"clip_ratio/high_max": 0.016176471021026373,
"clip_ratio/high_mean": 0.009732972481288016,
"clip_ratio/low_mean": 0.0016447368543595076,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011377709335647523,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2449.0,
"completions/max_terminated_length": 2449.0,
"completions/mean_length": 1750.53125,
"completions/mean_terminated_length": 1750.53125,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.13329231040552258,
"epoch": 0.00136,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.2919148206710815,
"kl": 0.0019803230388788506,
"learning_rate": 3.657142857142857e-06,
"loss": 0.0395,
"num_tokens": 1303537.0,
"reward": 0.5606250166893005,
"reward_std": 0.2351399064064026,
"rewards/rollout_reward_func/mean": 0.5606250166893005,
"rewards/rollout_reward_func/std": 0.41600972414016724,
"sampling/importance_sampling_ratio/max": 1.4119406938552856,
"sampling/importance_sampling_ratio/mean": 0.9287126064300537,
"sampling/importance_sampling_ratio/min": 0.3827318847179413,
"sampling/sampling_logp_difference/max": 0.6636209487915039,
"sampling/sampling_logp_difference/mean": 0.017019610852003098,
"step": 17,
"step_time": 15.075951100999873
},
{
"clip_ratio/high_max": 0.0062500000931322575,
"clip_ratio/high_mean": 0.0031250000465661287,
"clip_ratio/low_mean": 0.0016447368543595076,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004769736900925636,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2799.0,
"completions/max_terminated_length": 2799.0,
"completions/mean_length": 2316.9375,
"completions/mean_terminated_length": 2316.9375,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.16856362204998732,
"epoch": 0.00144,
"frac_reward_zero_std": 0.625,
"grad_norm": 2.867643356323242,
"kl": 0.005084036383777857,
"learning_rate": 3.885714285714286e-06,
"loss": -0.0049,
"num_tokens": 1391780.0,
"reward": 0.37031251192092896,
"reward_std": 0.10187499970197678,
"rewards/rollout_reward_func/mean": 0.37031251192092896,
"rewards/rollout_reward_func/std": 0.2657124996185303,
"sampling/importance_sampling_ratio/max": 1.9782981872558594,
"sampling/importance_sampling_ratio/mean": 1.0121815204620361,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.48717403411865234,
"sampling/sampling_logp_difference/mean": 0.022180214524269104,
"step": 18,
"step_time": 17.064124244999903
},
{
"clip_ratio/high_max": 0.0062500000931322575,
"clip_ratio/high_mean": 0.0031250000465661287,
"clip_ratio/low_mean": 0.0015243901871144772,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004649390233680606,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2798.0,
"completions/max_terminated_length": 2798.0,
"completions/mean_length": 2216.0,
"completions/mean_terminated_length": 2216.0,
"completions/min_length": 1563.0,
"completions/min_terminated_length": 1563.0,
"entropy": 0.1883529694750905,
"epoch": 0.00152,
"frac_reward_zero_std": 0.875,
"grad_norm": 2.2749135494232178,
"kl": 0.005091317143524066,
"learning_rate": 4.114285714285714e-06,
"loss": 0.0455,
"num_tokens": 1475873.0,
"reward": 0.4059374928474426,
"reward_std": 0.00812500063329935,
"rewards/rollout_reward_func/mean": 0.4059374928474426,
"rewards/rollout_reward_func/std": 0.29813244938850403,
"sampling/importance_sampling_ratio/max": 2.81974196434021,
"sampling/importance_sampling_ratio/mean": 1.0345871448516846,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.8696746826171875,
"sampling/sampling_logp_difference/mean": 0.02671925723552704,
"step": 19,
"step_time": 17.602816416999985
},
{
"clip_ratio/high_max": 0.011488970601931214,
"clip_ratio/high_mean": 0.005744485300965607,
"clip_ratio/low_mean": 0.0048926768358796835,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01063716213684529,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2819.0,
"completions/max_terminated_length": 2819.0,
"completions/mean_length": 1814.28125,
"completions/mean_terminated_length": 1814.28125,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.1282729902304709,
"epoch": 0.0016,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.691245198249817,
"kl": 0.002560590350185521,
"learning_rate": 4.342857142857142e-06,
"loss": -0.0259,
"num_tokens": 1546081.0,
"reward": 0.612500011920929,
"reward_std": 0.18216876685619354,
"rewards/rollout_reward_func/mean": 0.612500011920929,
"rewards/rollout_reward_func/std": 0.43820008635520935,
"sampling/importance_sampling_ratio/max": 1.9052116870880127,
"sampling/importance_sampling_ratio/mean": 1.007737636566162,
"sampling/importance_sampling_ratio/min": 0.5377175211906433,
"sampling/sampling_logp_difference/max": 0.6581223011016846,
"sampling/sampling_logp_difference/mean": 0.01716558076441288,
"step": 20,
"step_time": 16.950590521000322
},
{
"clip_ratio/high_max": 0.010667945956811309,
"clip_ratio/high_mean": 0.005333972978405654,
"clip_ratio/low_mean": 0.004784891498275101,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010118864476680756,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2791.0,
"completions/max_terminated_length": 2791.0,
"completions/mean_length": 1928.25,
"completions/mean_terminated_length": 1928.25,
"completions/min_length": 1054.0,
"completions/min_terminated_length": 1054.0,
"entropy": 0.1822828585281968,
"epoch": 0.00168,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.4209487438201904,
"kl": 0.0024251511349575594,
"learning_rate": 4.571428571428571e-06,
"loss": -0.038,
"num_tokens": 1620395.0,
"reward": 0.6059374809265137,
"reward_std": 0.20529377460479736,
"rewards/rollout_reward_func/mean": 0.6059374809265137,
"rewards/rollout_reward_func/std": 0.4463822841644287,
"sampling/importance_sampling_ratio/max": 1.6781816482543945,
"sampling/importance_sampling_ratio/mean": 1.0399606227874756,
"sampling/importance_sampling_ratio/min": 0.30244705080986023,
"sampling/sampling_logp_difference/max": 0.7038769721984863,
"sampling/sampling_logp_difference/mean": 0.028661729767918587,
"step": 21,
"step_time": 16.793115096000065
},
{
"clip_ratio/high_max": 0.01126575656235218,
"clip_ratio/high_mean": 0.00563287828117609,
"clip_ratio/low_mean": 0.0018382353009656072,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007471113582141697,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2806.0,
"completions/max_terminated_length": 2806.0,
"completions/mean_length": 1895.0,
"completions/mean_terminated_length": 1895.0,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.11727871629409492,
"epoch": 0.00176,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.4466348886489868,
"kl": 0.003775272169150412,
"learning_rate": 4.8e-06,
"loss": 0.0009,
"num_tokens": 1693497.0,
"reward": 0.5303125381469727,
"reward_std": 0.2740437984466553,
"rewards/rollout_reward_func/mean": 0.5303125381469727,
"rewards/rollout_reward_func/std": 0.4203799068927765,
"sampling/importance_sampling_ratio/max": 1.6391761302947998,
"sampling/importance_sampling_ratio/mean": 0.9023667573928833,
"sampling/importance_sampling_ratio/min": 0.30269956588745117,
"sampling/sampling_logp_difference/max": 1.103229284286499,
"sampling/sampling_logp_difference/mean": 0.028024764731526375,
"step": 22,
"step_time": 16.31928637299984
},
{
"clip_ratio/high_max": 0.006761695956811309,
"clip_ratio/high_mean": 0.0033808479784056544,
"clip_ratio/low_mean": 0.0034829722717404366,
"clip_ratio/low_min": 0.003289473708719015,
"clip_ratio/region_mean": 0.006863820250146091,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2795.0,
"completions/max_terminated_length": 2795.0,
"completions/mean_length": 1923.9375,
"completions/mean_terminated_length": 1923.9375,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.14631808176636696,
"epoch": 0.00184,
"frac_reward_zero_std": 0.625,
"grad_norm": 1.9489635229110718,
"kl": 0.0022731795979780145,
"learning_rate": 5.0285714285714285e-06,
"loss": -0.0229,
"num_tokens": 1767354.0,
"reward": 0.4168750047683716,
"reward_std": 0.20417675375938416,
"rewards/rollout_reward_func/mean": 0.4168750047683716,
"rewards/rollout_reward_func/std": 0.33288994431495667,
"sampling/importance_sampling_ratio/max": 2.5520823001861572,
"sampling/importance_sampling_ratio/mean": 1.124953269958496,
"sampling/importance_sampling_ratio/min": 0.3814745247364044,
"sampling/sampling_logp_difference/max": 0.9945569038391113,
"sampling/sampling_logp_difference/mean": 0.021298212930560112,
"step": 23,
"step_time": 16.909164390999877
},
{
"clip_ratio/high_max": 0.0062500000931322575,
"clip_ratio/high_mean": 0.0031250000465661287,
"clip_ratio/low_mean": 0.0015625000232830644,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004687500069849193,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2798.0,
"completions/max_terminated_length": 2798.0,
"completions/mean_length": 2009.28125,
"completions/mean_terminated_length": 2009.28125,
"completions/min_length": 1054.0,
"completions/min_terminated_length": 1054.0,
"entropy": 0.18214968033134937,
"epoch": 0.00192,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.610776662826538,
"kl": 0.003005845101142768,
"learning_rate": 5.257142857142857e-06,
"loss": -0.0704,
"num_tokens": 1844719.0,
"reward": 0.4243749976158142,
"reward_std": 0.21341876685619354,
"rewards/rollout_reward_func/mean": 0.4243749976158142,
"rewards/rollout_reward_func/std": 0.362561970949173,
"sampling/importance_sampling_ratio/max": 1.7062076330184937,
"sampling/importance_sampling_ratio/mean": 1.0438251495361328,
"sampling/importance_sampling_ratio/min": 0.3492918312549591,
"sampling/sampling_logp_difference/max": 0.5720778703689575,
"sampling/sampling_logp_difference/mean": 0.023630155250430107,
"step": 24,
"step_time": 17.14499585999988
},
{
"clip_ratio/high_max": 0.0031250000465661287,
"clip_ratio/high_mean": 0.0015625000232830644,
"clip_ratio/low_mean": 0.0031250000465661287,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004687500069849193,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2440.0,
"completions/max_terminated_length": 2440.0,
"completions/mean_length": 2006.6875,
"completions/mean_terminated_length": 2006.6875,
"completions/min_length": 1055.0,
"completions/min_terminated_length": 1055.0,
"entropy": 0.11316484399139881,
"epoch": 0.002,
"frac_reward_zero_std": 0.75,
"grad_norm": 1.0315803289413452,
"kl": 0.0030158030276652426,
"learning_rate": 5.485714285714286e-06,
"loss": 0.0116,
"num_tokens": 1921270.0,
"reward": 0.47968751192092896,
"reward_std": 0.07062499970197678,
"rewards/rollout_reward_func/mean": 0.47968751192092896,
"rewards/rollout_reward_func/std": 0.36560583114624023,
"sampling/importance_sampling_ratio/max": 1.54865562915802,
"sampling/importance_sampling_ratio/mean": 0.9767700433731079,
"sampling/importance_sampling_ratio/min": 0.5038349628448486,
"sampling/sampling_logp_difference/max": 0.4957547187805176,
"sampling/sampling_logp_difference/mean": 0.01319466158747673,
"step": 25,
"step_time": 15.574967514000036
},
{
"clip_ratio/high_max": 0.007352941203862429,
"clip_ratio/high_mean": 0.0036764706019312143,
"clip_ratio/low_mean": 0.0015625000232830644,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005238970625214279,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2816.0,
"completions/max_terminated_length": 2816.0,
"completions/mean_length": 2016.375,
"completions/mean_terminated_length": 2016.375,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.1327181551605463,
"epoch": 0.00208,
"frac_reward_zero_std": 0.625,
"grad_norm": 1.1481389999389648,
"kl": 0.0036689931839646306,
"learning_rate": 5.7142857142857145e-06,
"loss": -0.004,
"num_tokens": 1998617.0,
"reward": 0.4909375011920929,
"reward_std": 0.12780338525772095,
"rewards/rollout_reward_func/mean": 0.4909375011920929,
"rewards/rollout_reward_func/std": 0.3668818771839142,
"sampling/importance_sampling_ratio/max": 1.5999431610107422,
"sampling/importance_sampling_ratio/mean": 0.9634629487991333,
"sampling/importance_sampling_ratio/min": 0.2564904987812042,
"sampling/sampling_logp_difference/max": 0.6982665061950684,
"sampling/sampling_logp_difference/mean": 0.01972239464521408,
"step": 26,
"step_time": 17.251899020999645
},
{
"clip_ratio/high_max": 0.008928571827709675,
"clip_ratio/high_mean": 0.004464285913854837,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004464285913854837,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2810.0,
"completions/max_terminated_length": 2810.0,
"completions/mean_length": 1849.84375,
"completions/mean_terminated_length": 1849.84375,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.09636542806401849,
"epoch": 0.00216,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.7088967561721802,
"kl": 0.002837517200532602,
"learning_rate": 5.942857142857143e-06,
"loss": -0.0039,
"num_tokens": 2070057.0,
"reward": 0.5687500238418579,
"reward_std": 0.2160891890525818,
"rewards/rollout_reward_func/mean": 0.5687500238418579,
"rewards/rollout_reward_func/std": 0.4140106439590454,
"sampling/importance_sampling_ratio/max": 1.6554824113845825,
"sampling/importance_sampling_ratio/mean": 1.0057581663131714,
"sampling/importance_sampling_ratio/min": 0.13771295547485352,
"sampling/sampling_logp_difference/max": 1.693850040435791,
"sampling/sampling_logp_difference/mean": 0.015657048672437668,
"step": 27,
"step_time": 16.657898435000106
},
{
"clip_ratio/high_max": 0.0036764706019312143,
"clip_ratio/high_mean": 0.0018382353009656072,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0018382353009656072,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2800.0,
"completions/max_terminated_length": 2800.0,
"completions/mean_length": 2166.28125,
"completions/mean_terminated_length": 2166.28125,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.11421543313190341,
"epoch": 0.00224,
"frac_reward_zero_std": 0.625,
"grad_norm": 1.0997257232666016,
"kl": 0.001993668673094362,
"learning_rate": 6.171428571428571e-06,
"loss": 0.0093,
"num_tokens": 2152509.0,
"reward": 0.4090625047683716,
"reward_std": 0.14099711179733276,
"rewards/rollout_reward_func/mean": 0.4090625047683716,
"rewards/rollout_reward_func/std": 0.3153631389141083,
"sampling/importance_sampling_ratio/max": 1.4471888542175293,
"sampling/importance_sampling_ratio/mean": 0.9940881133079529,
"sampling/importance_sampling_ratio/min": 0.36628207564353943,
"sampling/sampling_logp_difference/max": 0.4437229633331299,
"sampling/sampling_logp_difference/mean": 0.01664617471396923,
"step": 28,
"step_time": 16.654660168999953
},
{
"clip_ratio/high_max": 0.014613970648497343,
"clip_ratio/high_mean": 0.009043096331879497,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010996221215464175,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2427.0,
"completions/max_terminated_length": 2427.0,
"completions/mean_length": 1541.1875,
"completions/mean_terminated_length": 1541.1875,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.11991730704903603,
"epoch": 0.00232,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.753915548324585,
"kl": 0.004232257339026546,
"learning_rate": 6.4e-06,
"loss": -0.0561,
"num_tokens": 2213064.0,
"reward": 0.7221875190734863,
"reward_std": 0.28713130950927734,
"rewards/rollout_reward_func/mean": 0.7221875190734863,
"rewards/rollout_reward_func/std": 0.47223374247550964,
"sampling/importance_sampling_ratio/max": 1.8491572141647339,
"sampling/importance_sampling_ratio/mean": 0.9664063453674316,
"sampling/importance_sampling_ratio/min": 0.2533723711967468,
"sampling/sampling_logp_difference/max": 0.7475757598876953,
"sampling/sampling_logp_difference/mean": 0.02090834453701973,
"step": 29,
"step_time": 14.724566217999836
},
{
"clip_ratio/high_max": 0.011101973708719015,
"clip_ratio/high_mean": 0.0055509868543595076,
"clip_ratio/low_mean": 0.0036764706019312143,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009227457456290722,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2795.0,
"completions/max_terminated_length": 2795.0,
"completions/mean_length": 1690.28125,
"completions/mean_terminated_length": 1690.28125,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.15091887768357992,
"epoch": 0.0024,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.5955613851547241,
"kl": 0.003297277187812142,
"learning_rate": 6.628571428571428e-06,
"loss": -0.0312,
"num_tokens": 2278894.0,
"reward": 0.5768749713897705,
"reward_std": 0.2879711389541626,
"rewards/rollout_reward_func/mean": 0.5768749713897705,
"rewards/rollout_reward_func/std": 0.4466031789779663,
"sampling/importance_sampling_ratio/max": 2.0268845558166504,
"sampling/importance_sampling_ratio/mean": 0.9738575220108032,
"sampling/importance_sampling_ratio/min": 0.3578207492828369,
"sampling/sampling_logp_difference/max": 0.7220923900604248,
"sampling/sampling_logp_difference/mean": 0.023466479033231735,
"step": 30,
"step_time": 15.794334728999956
},
{
"clip_ratio/high_max": 0.008928571827709675,
"clip_ratio/high_mean": 0.006302521098405123,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006302521098405123,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2791.0,
"completions/max_terminated_length": 2791.0,
"completions/mean_length": 1802.28125,
"completions/mean_terminated_length": 1802.28125,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.1271415469236672,
"epoch": 0.00248,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.5290101170539856,
"kl": 0.007829649756786239,
"learning_rate": 6.857142857142856e-06,
"loss": 0.0219,
"num_tokens": 2348595.0,
"reward": 0.6468750238418579,
"reward_std": 0.15625,
"rewards/rollout_reward_func/mean": 0.6468750238418579,
"rewards/rollout_reward_func/std": 0.4312205910682678,
"sampling/importance_sampling_ratio/max": 1.4284578561782837,
"sampling/importance_sampling_ratio/mean": 1.008836269378662,
"sampling/importance_sampling_ratio/min": 0.5545295476913452,
"sampling/sampling_logp_difference/max": 0.9234024286270142,
"sampling/sampling_logp_difference/mean": 0.020626772195100784,
"step": 31,
"step_time": 16.82306190600002
},
{
"clip_ratio/high_max": 0.010130719048902392,
"clip_ratio/high_mean": 0.005065359524451196,
"clip_ratio/low_mean": 0.0036210318794474006,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008686391403898597,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2450.0,
"completions/max_terminated_length": 2450.0,
"completions/mean_length": 1986.28125,
"completions/mean_terminated_length": 1986.28125,
"completions/min_length": 1054.0,
"completions/min_terminated_length": 1054.0,
"entropy": 0.13206067122519016,
"epoch": 0.00256,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.3528786897659302,
"kl": 0.008182306781236548,
"learning_rate": 7.085714285714285e-06,
"loss": -0.0853,
"num_tokens": 2424934.0,
"reward": 0.4606249928474426,
"reward_std": 0.15091878175735474,
"rewards/rollout_reward_func/mean": 0.4606249928474426,
"rewards/rollout_reward_func/std": 0.3846149146556854,
"sampling/importance_sampling_ratio/max": 2.71755313873291,
"sampling/importance_sampling_ratio/mean": 1.051027774810791,
"sampling/importance_sampling_ratio/min": 0.38737601041793823,
"sampling/sampling_logp_difference/max": 0.7733535766601562,
"sampling/sampling_logp_difference/mean": 0.020959284156560898,
"step": 32,
"step_time": 15.57071794400008
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0022321429569274187,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0022321429569274187,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2803.0,
"completions/max_terminated_length": 2803.0,
"completions/mean_length": 2106.9375,
"completions/mean_terminated_length": 2106.9375,
"completions/min_length": 1054.0,
"completions/min_terminated_length": 1054.0,
"entropy": 0.12445190898142755,
"epoch": 0.00264,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.12888294458389282,
"kl": 0.003734915575478226,
"learning_rate": 7.314285714285714e-06,
"loss": -0.0017,
"num_tokens": 2505564.0,
"reward": 0.4637500047683716,
"reward_std": 0.13466876745224,
"rewards/rollout_reward_func/mean": 0.4637500047683716,
"rewards/rollout_reward_func/std": 0.37349048256874084,
"sampling/importance_sampling_ratio/max": 2.2378056049346924,
"sampling/importance_sampling_ratio/mean": 1.0552072525024414,
"sampling/importance_sampling_ratio/min": 0.3374383747577667,
"sampling/sampling_logp_difference/max": 1.084688663482666,
"sampling/sampling_logp_difference/mean": 0.015957504510879517,
"step": 33,
"step_time": 16.374967034000065
},
{
"clip_ratio/high_max": 0.004464285913854837,
"clip_ratio/high_mean": 0.0022321429569274187,
"clip_ratio/low_mean": 0.0022321429569274187,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004464285913854837,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2416.0,
"completions/max_terminated_length": 2416.0,
"completions/mean_length": 1479.28125,
"completions/mean_terminated_length": 1479.28125,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.07319290563464165,
"epoch": 0.00272,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.5905581712722778,
"kl": 0.02232863545214059,
"learning_rate": 7.542857142857142e-06,
"loss": 0.0285,
"num_tokens": 2564043.0,
"reward": 0.7212499976158142,
"reward_std": 0.31069982051849365,
"rewards/rollout_reward_func/mean": 0.7212499976158142,
"rewards/rollout_reward_func/std": 0.45388466119766235,
"sampling/importance_sampling_ratio/max": 1.5257185697555542,
"sampling/importance_sampling_ratio/mean": 0.9286473989486694,
"sampling/importance_sampling_ratio/min": 0.35652607679367065,
"sampling/sampling_logp_difference/max": 1.0259580612182617,
"sampling/sampling_logp_difference/mean": 0.0132124163210392,
"step": 34,
"step_time": 14.42571913799975
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2806.0,
"completions/max_terminated_length": 2806.0,
"completions/mean_length": 1752.5,
"completions/mean_terminated_length": 1752.5,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.09324552165344357,
"epoch": 0.0028,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.717671811580658,
"kl": 0.004631695612260955,
"learning_rate": 7.771428571428572e-06,
"loss": 0.0108,
"num_tokens": 2632179.0,
"reward": 0.4125000238418579,
"reward_std": 0.26933753490448,
"rewards/rollout_reward_func/mean": 0.4125000238418579,
"rewards/rollout_reward_func/std": 0.36455005407333374,
"sampling/importance_sampling_ratio/max": 1.790269136428833,
"sampling/importance_sampling_ratio/mean": 1.1154439449310303,
"sampling/importance_sampling_ratio/min": 0.7148804068565369,
"sampling/sampling_logp_difference/max": 0.596367359161377,
"sampling/sampling_logp_difference/mean": 0.01612972654402256,
"step": 35,
"step_time": 16.94071342400025
},
{
"clip_ratio/high_max": 0.003289473708719015,
"clip_ratio/high_mean": 0.0016447368543595076,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0016447368543595076,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2780.0,
"completions/max_terminated_length": 2780.0,
"completions/mean_length": 2210.65625,
"completions/mean_terminated_length": 2210.65625,
"completions/min_length": 1565.0,
"completions/min_terminated_length": 1565.0,
"entropy": 0.13052229024469852,
"epoch": 0.00288,
"frac_reward_zero_std": 0.875,
"grad_norm": 1.915448784828186,
"kl": 0.014049735953449272,
"learning_rate": 8e-06,
"loss": 0.0114,
"num_tokens": 2716101.0,
"reward": 0.3787500262260437,
"reward_std": 0.0624999962747097,
"rewards/rollout_reward_func/mean": 0.3787500262260437,
"rewards/rollout_reward_func/std": 0.26268768310546875,
"sampling/importance_sampling_ratio/max": 1.6097471714019775,
"sampling/importance_sampling_ratio/mean": 0.8499077558517456,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 24.667917251586914,
"sampling/sampling_logp_difference/mean": 0.08357222378253937,
"step": 36,
"step_time": 16.647620255999527
},
{
"clip_ratio/high_max": 0.013392857741564512,
"clip_ratio/high_mean": 0.006696428870782256,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006696428870782256,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2787.0,
"completions/max_terminated_length": 2787.0,
"completions/mean_length": 1735.125,
"completions/mean_terminated_length": 1735.125,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.08023441676050425,
"epoch": 0.00296,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.0067864656448364,
"kl": 0.014214006034308113,
"learning_rate": 7.999999976246485e-06,
"loss": 0.005,
"num_tokens": 2783558.0,
"reward": 0.5481250286102295,
"reward_std": 0.2473391890525818,
"rewards/rollout_reward_func/mean": 0.5481250286102295,
"rewards/rollout_reward_func/std": 0.41463109850883484,
"sampling/importance_sampling_ratio/max": 1.9839459657669067,
"sampling/importance_sampling_ratio/mean": 1.0556774139404297,
"sampling/importance_sampling_ratio/min": 0.5570288896560669,
"sampling/sampling_logp_difference/max": 0.7343063354492188,
"sampling/sampling_logp_difference/mean": 0.01291065476834774,
"step": 37,
"step_time": 15.852365188000249
},
{
"clip_ratio/high_max": 0.013429548125714064,
"clip_ratio/high_mean": 0.008277274086140096,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008277274086140096,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2790.0,
"completions/max_terminated_length": 2790.0,
"completions/mean_length": 1825.5625,
"completions/mean_terminated_length": 1825.5625,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.07684489572420716,
"epoch": 0.00304,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.9191060662269592,
"kl": 0.008681207757035736,
"learning_rate": 7.999999904985944e-06,
"loss": -0.0211,
"num_tokens": 2854054.0,
"reward": 0.6575000286102295,
"reward_std": 0.32216876745224,
"rewards/rollout_reward_func/mean": 0.6575000286102295,
"rewards/rollout_reward_func/std": 0.46569401025772095,
"sampling/importance_sampling_ratio/max": 1.8603460788726807,
"sampling/importance_sampling_ratio/mean": 1.0407439470291138,
"sampling/importance_sampling_ratio/min": 0.5873942971229553,
"sampling/sampling_logp_difference/max": 0.6204257011413574,
"sampling/sampling_logp_difference/mean": 0.011706141754984856,
"step": 38,
"step_time": 16.086839031999943
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2800.0,
"completions/max_terminated_length": 2800.0,
"completions/mean_length": 2006.0625,
"completions/mean_terminated_length": 2006.0625,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.09726127330213785,
"epoch": 0.00312,
"frac_reward_zero_std": 0.75,
"grad_norm": 1.442636489868164,
"kl": 0.01900345625472255,
"learning_rate": 7.999999786218377e-06,
"loss": -0.0229,
"num_tokens": 2930928.0,
"reward": 0.42125001549720764,
"reward_std": 0.13466876745224,
"rewards/rollout_reward_func/mean": 0.42125001549720764,
"rewards/rollout_reward_func/std": 0.3237656354904175,
"sampling/importance_sampling_ratio/max": 1.6864854097366333,
"sampling/importance_sampling_ratio/mean": 0.9331543445587158,
"sampling/importance_sampling_ratio/min": 0.17830577492713928,
"sampling/sampling_logp_difference/max": 0.9498655796051025,
"sampling/sampling_logp_difference/mean": 0.01994011551141739,
"step": 39,
"step_time": 17.24544241999979
},
{
"clip_ratio/high_max": 0.013523391913622618,
"clip_ratio/high_mean": 0.006761695956811309,
"clip_ratio/low_mean": 0.0016447368543595076,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008406432811170816,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2440.0,
"completions/max_terminated_length": 2440.0,
"completions/mean_length": 1899.96875,
"completions/mean_terminated_length": 1899.96875,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.09562211390584707,
"epoch": 0.0032,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.7564519047737122,
"kl": 0.012280215043574572,
"learning_rate": 7.999999619943787e-06,
"loss": 0.0388,
"num_tokens": 3004071.0,
"reward": 0.53125,
"reward_std": 0.13466876745224,
"rewards/rollout_reward_func/mean": 0.53125,
"rewards/rollout_reward_func/std": 0.40266650915145874,
"sampling/importance_sampling_ratio/max": 1.7533916234970093,
"sampling/importance_sampling_ratio/mean": 1.012838363647461,
"sampling/importance_sampling_ratio/min": 0.40302401781082153,
"sampling/sampling_logp_difference/max": 0.6195348501205444,
"sampling/sampling_logp_difference/mean": 0.016937807202339172,
"step": 40,
"step_time": 15.278612154999792
},
{
"clip_ratio/high_max": 0.013322473270818591,
"clip_ratio/high_mean": 0.0066612366354092956,
"clip_ratio/low_mean": 0.0016447368543595076,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008305973489768803,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2818.0,
"completions/max_terminated_length": 2818.0,
"completions/mean_length": 2140.78125,
"completions/mean_terminated_length": 2140.78125,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.10391843365505338,
"epoch": 0.00328,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.684903621673584,
"kl": 0.0072205948672490194,
"learning_rate": 7.999999406162173e-06,
"loss": 0.0078,
"num_tokens": 3085711.0,
"reward": 0.41718751192092896,
"reward_std": 0.14279377460479736,
"rewards/rollout_reward_func/mean": 0.41718751192092896,
"rewards/rollout_reward_func/std": 0.33007559180259705,
"sampling/importance_sampling_ratio/max": 1.5918241739273071,
"sampling/importance_sampling_ratio/mean": 0.900505006313324,
"sampling/importance_sampling_ratio/min": 0.29988786578178406,
"sampling/sampling_logp_difference/max": 1.036886215209961,
"sampling/sampling_logp_difference/mean": 0.019227981567382812,
"step": 41,
"step_time": 16.941576514999497
},
{
"clip_ratio/high_max": 0.0032051282469183207,
"clip_ratio/high_mean": 0.0016025641234591603,
"clip_ratio/low_mean": 0.0035156250232830644,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005118189146742225,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2780.0,
"completions/max_terminated_length": 2780.0,
"completions/mean_length": 1944.25,
"completions/mean_terminated_length": 1944.25,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.09584560617804527,
"epoch": 0.00336,
"frac_reward_zero_std": 0.5,
"grad_norm": 2.3526816368103027,
"kl": 0.027870278747286648,
"learning_rate": 7.999999144873542e-06,
"loss": 0.1103,
"num_tokens": 3160194.0,
"reward": 0.4490624964237213,
"reward_std": 0.20263297855854034,
"rewards/rollout_reward_func/mean": 0.4490624964237213,
"rewards/rollout_reward_func/std": 0.36338335275650024,
"sampling/importance_sampling_ratio/max": 2.6562483310699463,
"sampling/importance_sampling_ratio/mean": 1.036879062652588,
"sampling/importance_sampling_ratio/min": 0.24595271050930023,
"sampling/sampling_logp_difference/max": 0.9235069751739502,
"sampling/sampling_logp_difference/mean": 0.02009188011288643,
"step": 42,
"step_time": 16.650967574000333
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0066964286379516125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0066964286379516125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2791.0,
"completions/max_terminated_length": 2791.0,
"completions/mean_length": 1675.03125,
"completions/mean_terminated_length": 1675.03125,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.12920551793649793,
"epoch": 0.00344,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.3091895580291748,
"kl": 0.03491167013999075,
"learning_rate": 7.999998836077897e-06,
"loss": 0.0888,
"num_tokens": 3225903.0,
"reward": 0.5174999833106995,
"reward_std": 0.3608438968658447,
"rewards/rollout_reward_func/mean": 0.5174999833106995,
"rewards/rollout_reward_func/std": 0.43675488233566284,
"sampling/importance_sampling_ratio/max": 2.373917579650879,
"sampling/importance_sampling_ratio/mean": 1.0001801252365112,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.3759169578552246,
"sampling/sampling_logp_difference/mean": 0.02347693033516407,
"step": 43,
"step_time": 17.192637601000115
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2798.0,
"completions/max_terminated_length": 2798.0,
"completions/mean_length": 2095.3125,
"completions/mean_terminated_length": 2095.3125,
"completions/min_length": 1562.0,
"completions/min_terminated_length": 1562.0,
"entropy": 0.09558335272595286,
"epoch": 0.00352,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.6341590881347656,
"kl": 0.02431841316865757,
"learning_rate": 7.99999847977524e-06,
"loss": -0.0206,
"num_tokens": 3305945.0,
"reward": 0.5049999952316284,
"reward_std": 0.14433754980564117,
"rewards/rollout_reward_func/mean": 0.5049999952316284,
"rewards/rollout_reward_func/std": 0.3978976607322693,
"sampling/importance_sampling_ratio/max": 2.0796568393707275,
"sampling/importance_sampling_ratio/mean": 0.8876084089279175,
"sampling/importance_sampling_ratio/min": 0.34211352467536926,
"sampling/sampling_logp_difference/max": 1.0857441425323486,
"sampling/sampling_logp_difference/mean": 0.01929028518497944,
"step": 44,
"step_time": 16.607955621999963
},
{
"clip_ratio/high_max": 0.009783434681594372,
"clip_ratio/high_mean": 0.004891717340797186,
"clip_ratio/low_mean": 0.0034007353242486715,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008292452665045857,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2816.0,
"completions/max_terminated_length": 2816.0,
"completions/mean_length": 2426.71875,
"completions/mean_terminated_length": 2426.71875,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.13120493851602077,
"epoch": 0.0036,
"frac_reward_zero_std": 0.75,
"grad_norm": 1.2010557651519775,
"kl": 0.014120981490123086,
"learning_rate": 7.999998075965583e-06,
"loss": -0.0277,
"num_tokens": 3397449.0,
"reward": 0.3384374976158142,
"reward_std": 0.08029378205537796,
"rewards/rollout_reward_func/mean": 0.3384374976158142,
"rewards/rollout_reward_func/std": 0.21492847800254822,
"sampling/importance_sampling_ratio/max": 2.0331103801727295,
"sampling/importance_sampling_ratio/mean": 1.036205768585205,
"sampling/importance_sampling_ratio/min": 0.44029006361961365,
"sampling/sampling_logp_difference/max": 0.8087775707244873,
"sampling/sampling_logp_difference/mean": 0.021842796355485916,
"step": 45,
"step_time": 17.039232532999677
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.004464285913854837,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004464285913854837,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2820.0,
"completions/max_terminated_length": 2820.0,
"completions/mean_length": 2035.78125,
"completions/mean_terminated_length": 2035.78125,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.10772825870662928,
"epoch": 0.00368,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.6874250173568726,
"kl": 0.02060257241828367,
"learning_rate": 7.99999762464893e-06,
"loss": 0.0037,
"num_tokens": 3475466.0,
"reward": 0.35374999046325684,
"reward_std": 0.13466878235340118,
"rewards/rollout_reward_func/mean": 0.35374999046325684,
"rewards/rollout_reward_func/std": 0.26349693536758423,
"sampling/importance_sampling_ratio/max": 2.4506301879882812,
"sampling/importance_sampling_ratio/mean": 1.0585436820983887,
"sampling/importance_sampling_ratio/min": 0.20336686074733734,
"sampling/sampling_logp_difference/max": 1.0288989543914795,
"sampling/sampling_logp_difference/mean": 0.019599031656980515,
"step": 46,
"step_time": 17.008759735000012
},
{
"clip_ratio/high_max": 0.007352941203862429,
"clip_ratio/high_mean": 0.0036764706019312143,
"clip_ratio/low_mean": 0.0018382353009656072,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0055147059028968215,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2796.0,
"completions/max_terminated_length": 2796.0,
"completions/mean_length": 1804.875,
"completions/mean_terminated_length": 1804.875,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.11447951383888721,
"epoch": 0.00376,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.9577285051345825,
"kl": 0.0146886624279432,
"learning_rate": 7.999997125825284e-06,
"loss": 0.018,
"num_tokens": 3545642.0,
"reward": 0.550000011920929,
"reward_std": 0.25521132349967957,
"rewards/rollout_reward_func/mean": 0.550000011920929,
"rewards/rollout_reward_func/std": 0.4146043360233307,
"sampling/importance_sampling_ratio/max": 2.3037173748016357,
"sampling/importance_sampling_ratio/mean": 1.043008804321289,
"sampling/importance_sampling_ratio/min": 0.5624377727508545,
"sampling/sampling_logp_difference/max": 0.6249582767486572,
"sampling/sampling_logp_difference/mean": 0.017069321125745773,
"step": 47,
"step_time": 17.18442160299992
},
{
"clip_ratio/high_max": 0.011488970601931214,
"clip_ratio/high_mean": 0.005744485300965607,
"clip_ratio/low_mean": 0.003968254080973566,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009712739381939173,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2804.0,
"completions/max_terminated_length": 2804.0,
"completions/mean_length": 1970.9375,
"completions/mean_terminated_length": 1970.9375,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.09903696551918983,
"epoch": 0.00384,
"frac_reward_zero_std": 0.625,
"grad_norm": 1.1492125988006592,
"kl": 0.018403344380203635,
"learning_rate": 7.999996579494655e-06,
"loss": 0.0456,
"num_tokens": 3621220.0,
"reward": 0.4325000047683716,
"reward_std": 0.19716876745224,
"rewards/rollout_reward_func/mean": 0.4325000047683716,
"rewards/rollout_reward_func/std": 0.3565334677696228,
"sampling/importance_sampling_ratio/max": 2.369974136352539,
"sampling/importance_sampling_ratio/mean": 1.0478521585464478,
"sampling/importance_sampling_ratio/min": 0.11733747273683548,
"sampling/sampling_logp_difference/max": 1.4917361736297607,
"sampling/sampling_logp_difference/mean": 0.022218499332666397,
"step": 48,
"step_time": 16.41912825899999
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2804.0,
"completions/max_terminated_length": 2804.0,
"completions/mean_length": 2252.875,
"completions/mean_terminated_length": 2252.875,
"completions/min_length": 1565.0,
"completions/min_terminated_length": 1565.0,
"entropy": 0.09296703850850463,
"epoch": 0.00392,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.5641649961471558,
"kl": 0.03362530213780701,
"learning_rate": 7.999995985657054e-06,
"loss": -0.0171,
"num_tokens": 3706955.0,
"reward": 0.42624998092651367,
"reward_std": 0.13466876745224,
"rewards/rollout_reward_func/mean": 0.42624998092651367,
"rewards/rollout_reward_func/std": 0.3314265012741089,
"sampling/importance_sampling_ratio/max": 1.5233134031295776,
"sampling/importance_sampling_ratio/mean": 0.9442777633666992,
"sampling/importance_sampling_ratio/min": 0.2804383933544159,
"sampling/sampling_logp_difference/max": 1.1384481191635132,
"sampling/sampling_logp_difference/mean": 0.01721033826470375,
"step": 49,
"step_time": 16.923561193999603
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2820.0,
"completions/max_terminated_length": 2820.0,
"completions/mean_length": 2463.8125,
"completions/mean_terminated_length": 2463.8125,
"completions/min_length": 2040.0,
"completions/min_terminated_length": 2040.0,
"entropy": 0.09066143818199635,
"epoch": 0.004,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.10026834160089493,
"kl": 0.030927304484066553,
"learning_rate": 7.99999534431249e-06,
"loss": 0.0004,
"num_tokens": 3799818.0,
"reward": 0.30000001192092896,
"reward_std": 0.0,
"rewards/rollout_reward_func/mean": 0.30000001192092896,
"rewards/rollout_reward_func/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.050495147705078,
"sampling/importance_sampling_ratio/mean": 1.0762633085250854,
"sampling/importance_sampling_ratio/min": 0.29474106431007385,
"sampling/sampling_logp_difference/max": 1.1381915807724,
"sampling/sampling_logp_difference/mean": 0.01666702888906002,
"step": 50,
"step_time": 17.35131004799996
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0018382353009656072,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0018382353009656072,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2786.0,
"completions/max_terminated_length": 2786.0,
"completions/mean_length": 1773.84375,
"completions/mean_terminated_length": 1773.84375,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.06475613545626402,
"epoch": 0.00408,
"frac_reward_zero_std": 0.625,
"grad_norm": 3.30958890914917,
"kl": 0.015178573405137286,
"learning_rate": 7.99999465546097e-06,
"loss": 0.0343,
"num_tokens": 3868368.0,
"reward": 0.49562498927116394,
"reward_std": 0.1848391890525818,
"rewards/rollout_reward_func/mean": 0.49562498927116394,
"rewards/rollout_reward_func/std": 0.4042271077632904,
"sampling/importance_sampling_ratio/max": 2.036992311477661,
"sampling/importance_sampling_ratio/mean": 1.0444798469543457,
"sampling/importance_sampling_ratio/min": 0.6844988465309143,
"sampling/sampling_logp_difference/max": 0.4961543083190918,
"sampling/sampling_logp_difference/mean": 0.009073879569768906,
"step": 51,
"step_time": 17.00100525200014
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2809.0,
"completions/max_terminated_length": 2809.0,
"completions/mean_length": 1786.375,
"completions/mean_terminated_length": 1786.375,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.08096808800473809,
"epoch": 0.00416,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.6158044338226318,
"kl": 0.020677090127719566,
"learning_rate": 7.99999391910251e-06,
"loss": -0.0195,
"num_tokens": 3937756.0,
"reward": 0.4912499785423279,
"reward_std": 0.33183753490448,
"rewards/rollout_reward_func/mean": 0.4912499785423279,
"rewards/rollout_reward_func/std": 0.42849886417388916,
"sampling/importance_sampling_ratio/max": 1.5450013875961304,
"sampling/importance_sampling_ratio/mean": 0.9464795589447021,
"sampling/importance_sampling_ratio/min": 0.39816370606422424,
"sampling/sampling_logp_difference/max": 1.0278459787368774,
"sampling/sampling_logp_difference/mean": 0.015709228813648224,
"step": 52,
"step_time": 16.014014809000173
},
{
"clip_ratio/high_max": 0.003289473708719015,
"clip_ratio/high_mean": 0.0016447368543595076,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0016447368543595076,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2799.0,
"completions/max_terminated_length": 2799.0,
"completions/mean_length": 1632.4375,
"completions/mean_terminated_length": 1632.4375,
"completions/min_length": 1054.0,
"completions/min_terminated_length": 1054.0,
"entropy": 0.06091495987493545,
"epoch": 0.00424,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.7226957678794861,
"kl": 0.017386693391017616,
"learning_rate": 7.999993135237117e-06,
"loss": 0.0201,
"num_tokens": 4001420.0,
"reward": 0.6793749928474426,
"reward_std": 0.2570079565048218,
"rewards/rollout_reward_func/mean": 0.6793749928474426,
"rewards/rollout_reward_func/std": 0.46435481309890747,
"sampling/importance_sampling_ratio/max": 1.579625129699707,
"sampling/importance_sampling_ratio/mean": 0.9135901927947998,
"sampling/importance_sampling_ratio/min": 0.35018137097358704,
"sampling/sampling_logp_difference/max": 1.1071686744689941,
"sampling/sampling_logp_difference/mean": 0.015183830633759499,
"step": 53,
"step_time": 15.960780017000161
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2809.0,
"completions/max_terminated_length": 2809.0,
"completions/mean_length": 1589.6875,
"completions/mean_terminated_length": 1589.6875,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.054683255730196834,
"epoch": 0.00432,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.0967762470245361,
"kl": 0.021693169721402228,
"learning_rate": 7.999992303864804e-06,
"loss": -0.0452,
"num_tokens": 4063548.0,
"reward": 0.7262499928474426,
"reward_std": 0.3071783781051636,
"rewards/rollout_reward_func/mean": 0.7262499928474426,
"rewards/rollout_reward_func/std": 0.45173320174217224,
"sampling/importance_sampling_ratio/max": 1.3698084354400635,
"sampling/importance_sampling_ratio/mean": 0.9228720664978027,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.3379874229431152,
"sampling/sampling_logp_difference/mean": 0.013593094423413277,
"step": 54,
"step_time": 16.424249653999823
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0057151151122525334,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0057151151122525334,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2435.0,
"completions/max_terminated_length": 2435.0,
"completions/mean_length": 1791.34375,
"completions/mean_terminated_length": 1791.34375,
"completions/min_length": 1054.0,
"completions/min_terminated_length": 1054.0,
"entropy": 0.06585042458027601,
"epoch": 0.0044,
"frac_reward_zero_std": 0.625,
"grad_norm": 1.214758038520813,
"kl": 0.024209468625485897,
"learning_rate": 7.999991424985586e-06,
"loss": 0.0112,
"num_tokens": 4132756.0,
"reward": 0.4637500047683716,
"reward_std": 0.19184717535972595,
"rewards/rollout_reward_func/mean": 0.4637500047683716,
"rewards/rollout_reward_func/std": 0.35306718945503235,
"sampling/importance_sampling_ratio/max": 2.3594348430633545,
"sampling/importance_sampling_ratio/mean": 1.098282814025879,
"sampling/importance_sampling_ratio/min": 0.3494420647621155,
"sampling/sampling_logp_difference/max": 1.4460781812667847,
"sampling/sampling_logp_difference/mean": 0.01887095905840397,
"step": 55,
"step_time": 15.803478434999533
},
{
"clip_ratio/high_max": 0.003289473708719015,
"clip_ratio/high_mean": 0.0016447368543595076,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0016447368543595076,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2794.0,
"completions/max_terminated_length": 2794.0,
"completions/mean_length": 2308.09375,
"completions/mean_terminated_length": 2308.09375,
"completions/min_length": 1568.0,
"completions/min_terminated_length": 1568.0,
"entropy": 0.09696381096728146,
"epoch": 0.00448,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.32451269030570984,
"kl": 0.019486179284285754,
"learning_rate": 7.999990498599477e-06,
"loss": -0.0013,
"num_tokens": 4220279.0,
"reward": 0.3631250262260437,
"reward_std": 0.05983918905258179,
"rewards/rollout_reward_func/mean": 0.3631250262260437,
"rewards/rollout_reward_func/std": 0.2257665991783142,
"sampling/importance_sampling_ratio/max": 1.3443750143051147,
"sampling/importance_sampling_ratio/mean": 0.9363906979560852,
"sampling/importance_sampling_ratio/min": 0.3993731737136841,
"sampling/sampling_logp_difference/max": 0.668013334274292,
"sampling/sampling_logp_difference/mean": 0.01326768472790718,
"step": 56,
"step_time": 16.922060316999477
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2460.0,
"completions/max_terminated_length": 2460.0,
"completions/mean_length": 1963.34375,
"completions/mean_terminated_length": 1963.34375,
"completions/min_length": 1054.0,
"completions/min_terminated_length": 1054.0,
"entropy": 0.04914194135926664,
"epoch": 0.00456,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.5049991607666016,
"kl": 0.02837482520226331,
"learning_rate": 7.99998952470649e-06,
"loss": -0.008,
"num_tokens": 4295471.0,
"reward": 0.48374998569488525,
"reward_std": 0.0624999962747097,
"rewards/rollout_reward_func/mean": 0.48374998569488525,
"rewards/rollout_reward_func/std": 0.3627649247646332,
"sampling/importance_sampling_ratio/max": 1.912903904914856,
"sampling/importance_sampling_ratio/mean": 0.9784045219421387,
"sampling/importance_sampling_ratio/min": 0.4301539659500122,
"sampling/sampling_logp_difference/max": 0.8482755422592163,
"sampling/sampling_logp_difference/mean": 0.010939370840787888,
"step": 57,
"step_time": 15.453092248000303
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0034007353242486715,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034007353242486715,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2787.0,
"completions/max_terminated_length": 2787.0,
"completions/mean_length": 1872.28125,
"completions/mean_terminated_length": 1872.28125,
"completions/min_length": 1054.0,
"completions/min_terminated_length": 1054.0,
"entropy": 0.06335801596287638,
"epoch": 0.00464,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.6874297261238098,
"kl": 0.03325861978373723,
"learning_rate": 7.999988503306642e-06,
"loss": -0.0215,
"num_tokens": 4367703.0,
"reward": 0.4325000047683716,
"reward_std": 0.19716876745224,
"rewards/rollout_reward_func/mean": 0.4325000047683716,
"rewards/rollout_reward_func/std": 0.3528958261013031,
"sampling/importance_sampling_ratio/max": 1.3908125162124634,
"sampling/importance_sampling_ratio/mean": 0.8833715319633484,
"sampling/importance_sampling_ratio/min": 1.9220989599944005e-07,
"sampling/sampling_logp_difference/max": 13.085790634155273,
"sampling/sampling_logp_difference/mean": 0.04347304627299309,
"step": 58,
"step_time": 16.225704187000247
},
{
"clip_ratio/high_max": 0.004464285913854837,
"clip_ratio/high_mean": 0.0022321429569274187,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0022321429569274187,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2810.0,
"completions/max_terminated_length": 2810.0,
"completions/mean_length": 2212.5,
"completions/mean_terminated_length": 2212.5,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.0681739835999906,
"epoch": 0.00472,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.8530667424201965,
"kl": 0.03184444556973176,
"learning_rate": 7.999987434399948e-06,
"loss": 0.0114,
"num_tokens": 4452096.0,
"reward": 0.4006250202655792,
"reward_std": 0.13200798630714417,
"rewards/rollout_reward_func/mean": 0.4006250202655792,
"rewards/rollout_reward_func/std": 0.29461774230003357,
"sampling/importance_sampling_ratio/max": 1.6479169130325317,
"sampling/importance_sampling_ratio/mean": 1.004500150680542,
"sampling/importance_sampling_ratio/min": 0.30276352167129517,
"sampling/sampling_logp_difference/max": 1.1370731592178345,
"sampling/sampling_logp_difference/mean": 0.014959340915083885,
"step": 59,
"step_time": 17.880038248999654
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0031250000465661287,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0031250000465661287,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2438.0,
"completions/max_terminated_length": 2438.0,
"completions/mean_length": 2130.46875,
"completions/mean_terminated_length": 2130.46875,
"completions/min_length": 1567.0,
"completions/min_terminated_length": 1567.0,
"entropy": 0.055047230795025826,
"epoch": 0.0048,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.3532449007034302,
"kl": 0.01998938515316695,
"learning_rate": 7.999986317986426e-06,
"loss": -0.0728,
"num_tokens": 4533067.0,
"reward": 0.3746874928474426,
"reward_std": 0.07062499970197678,
"rewards/rollout_reward_func/mean": 0.3746874928474426,
"rewards/rollout_reward_func/std": 0.26494044065475464,
"sampling/importance_sampling_ratio/max": 1.6606436967849731,
"sampling/importance_sampling_ratio/mean": 0.94105064868927,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.8746337890625,
"sampling/sampling_logp_difference/mean": 0.013816887512803078,
"step": 60,
"step_time": 15.591869645000315
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0022321429569274187,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0022321429569274187,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2772.0,
"completions/max_terminated_length": 2772.0,
"completions/mean_length": 1693.875,
"completions/mean_terminated_length": 1693.875,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.05530149070546031,
"epoch": 0.00488,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.1544066667556763,
"kl": 0.016453518153866753,
"learning_rate": 7.999985154066091e-06,
"loss": 0.0093,
"num_tokens": 4598963.0,
"reward": 0.5693750381469727,
"reward_std": 0.32917672395706177,
"rewards/rollout_reward_func/mean": 0.5693750381469727,
"rewards/rollout_reward_func/std": 0.4425143301486969,
"sampling/importance_sampling_ratio/max": 2.6382031440734863,
"sampling/importance_sampling_ratio/mean": 1.0790200233459473,
"sampling/importance_sampling_ratio/min": 0.7093254327774048,
"sampling/sampling_logp_difference/max": 0.6202226877212524,
"sampling/sampling_logp_difference/mean": 0.011032961308956146,
"step": 61,
"step_time": 16.082178944999896
},
{
"clip_ratio/high_max": 0.0062500000931322575,
"clip_ratio/high_mean": 0.0031250000465661287,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0031250000465661287,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2806.0,
"completions/max_terminated_length": 2806.0,
"completions/mean_length": 2304.9375,
"completions/mean_terminated_length": 2304.9375,
"completions/min_length": 1559.0,
"completions/min_terminated_length": 1559.0,
"entropy": 0.06317996443249285,
"epoch": 0.00496,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.9210407137870789,
"kl": 0.019177823327481747,
"learning_rate": 7.999983942638965e-06,
"loss": -0.0262,
"num_tokens": 4686091.0,
"reward": 0.3434374928474426,
"reward_std": 0.08029377460479736,
"rewards/rollout_reward_func/mean": 0.3434374928474426,
"rewards/rollout_reward_func/std": 0.22245851159095764,
"sampling/importance_sampling_ratio/max": 1.911454677581787,
"sampling/importance_sampling_ratio/mean": 0.9809565544128418,
"sampling/importance_sampling_ratio/min": 0.25244244933128357,
"sampling/sampling_logp_difference/max": 0.9257916212081909,
"sampling/sampling_logp_difference/mean": 0.013646715320646763,
"step": 62,
"step_time": 16.97750502200006
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2421.0,
"completions/max_terminated_length": 2421.0,
"completions/mean_length": 1974.96875,
"completions/mean_terminated_length": 1974.96875,
"completions/min_length": 1054.0,
"completions/min_terminated_length": 1054.0,
"entropy": 0.04369071568362415,
"epoch": 0.00504,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.14146260917186737,
"kl": 0.05203759076539427,
"learning_rate": 7.999982683705066e-06,
"loss": 0.0006,
"num_tokens": 4761655.0,
"reward": 0.2800000011920929,
"reward_std": 0.0,
"rewards/rollout_reward_func/mean": 0.2800000011920929,
"rewards/rollout_reward_func/std": 0.05376172438263893,
"sampling/importance_sampling_ratio/max": 2.2244811058044434,
"sampling/importance_sampling_ratio/mean": 1.0279231071472168,
"sampling/importance_sampling_ratio/min": 0.3840067684650421,
"sampling/sampling_logp_difference/max": 0.9096496105194092,
"sampling/sampling_logp_difference/mean": 0.010698029771447182,
"step": 63,
"step_time": 16.219797216000416
},
{
"clip_ratio/high_max": 0.015190972248092294,
"clip_ratio/high_mean": 0.007595486124046147,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007595486124046147,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2815.0,
"completions/max_terminated_length": 2815.0,
"completions/mean_length": 1982.6875,
"completions/mean_terminated_length": 1982.6875,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.07828675024211407,
"epoch": 0.00512,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.7813690304756165,
"kl": 0.021903582994127646,
"learning_rate": 7.999981377264413e-06,
"loss": 0.0011,
"num_tokens": 4838046.0,
"reward": 0.5843750238418579,
"reward_std": 0.22841876745224,
"rewards/rollout_reward_func/mean": 0.5843750238418579,
"rewards/rollout_reward_func/std": 0.4214792251586914,
"sampling/importance_sampling_ratio/max": 1.827079176902771,
"sampling/importance_sampling_ratio/mean": 1.0283104181289673,
"sampling/importance_sampling_ratio/min": 0.25544387102127075,
"sampling/sampling_logp_difference/max": 1.0380005836486816,
"sampling/sampling_logp_difference/mean": 0.015725988894701004,
"step": 64,
"step_time": 16.373142061000408
},
{
"clip_ratio/high_max": 0.0036764706019312143,
"clip_ratio/high_mean": 0.0018382353009656072,
"clip_ratio/low_mean": 0.0018382353009656072,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0036764706019312143,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2801.0,
"completions/max_terminated_length": 2801.0,
"completions/mean_length": 2009.3125,
"completions/mean_terminated_length": 2009.3125,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.05688254698179662,
"epoch": 0.0052,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.2757700979709625,
"kl": 0.036635934142395854,
"learning_rate": 7.999980023317026e-06,
"loss": 0.0247,
"num_tokens": 4914779.0,
"reward": 0.45250001549720764,
"reward_std": 0.125,
"rewards/rollout_reward_func/mean": 0.45250001549720764,
"rewards/rollout_reward_func/std": 0.3471450209617615,
"sampling/importance_sampling_ratio/max": 1.4275991916656494,
"sampling/importance_sampling_ratio/mean": 0.908840537071228,
"sampling/importance_sampling_ratio/min": 0.23684662580490112,
"sampling/sampling_logp_difference/max": 1.4337669610977173,
"sampling/sampling_logp_difference/mean": 0.017221834510564804,
"step": 65,
"step_time": 16.79740752900011
},
{
"clip_ratio/high_max": 0.0036764706019312143,
"clip_ratio/high_mean": 0.0018382353009656072,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0018382353009656072,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2804.0,
"completions/max_terminated_length": 2804.0,
"completions/mean_length": 2059.75,
"completions/mean_terminated_length": 2059.75,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.07610421534627676,
"epoch": 0.00528,
"frac_reward_zero_std": 0.75,
"grad_norm": 1.4526886940002441,
"kl": 0.027375709789339453,
"learning_rate": 7.999978621862929e-06,
"loss": 0.0203,
"num_tokens": 4993633.0,
"reward": 0.4793750047683716,
"reward_std": 0.13200797140598297,
"rewards/rollout_reward_func/mean": 0.4793750047683716,
"rewards/rollout_reward_func/std": 0.3699514865875244,
"sampling/importance_sampling_ratio/max": 1.815674066543579,
"sampling/importance_sampling_ratio/mean": 1.0289226770401,
"sampling/importance_sampling_ratio/min": 0.5281765460968018,
"sampling/sampling_logp_difference/max": 0.6608150005340576,
"sampling/sampling_logp_difference/mean": 0.01253808755427599,
"step": 66,
"step_time": 16.794041812999467
},
{
"clip_ratio/high_max": 0.011259191203862429,
"clip_ratio/high_mean": 0.005629595601931214,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007582720601931214,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2812.0,
"completions/max_terminated_length": 2812.0,
"completions/mean_length": 1777.84375,
"completions/mean_terminated_length": 1777.84375,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.06151273613795638,
"epoch": 0.00536,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.1407759189605713,
"kl": 0.02162244959617965,
"learning_rate": 7.999977172902144e-06,
"loss": -0.0017,
"num_tokens": 5062556.0,
"reward": 0.48000001907348633,
"reward_std": 0.25434714555740356,
"rewards/rollout_reward_func/mean": 0.48000001907348633,
"rewards/rollout_reward_func/std": 0.3861806094646454,
"sampling/importance_sampling_ratio/max": 1.6365498304367065,
"sampling/importance_sampling_ratio/mean": 1.0415589809417725,
"sampling/importance_sampling_ratio/min": 0.4183502197265625,
"sampling/sampling_logp_difference/max": 0.8754826188087463,
"sampling/sampling_logp_difference/mean": 0.014012180268764496,
"step": 67,
"step_time": 17.151742474000002
},
{
"clip_ratio/high_max": 0.0058139534667134285,
"clip_ratio/high_mean": 0.0029069767333567142,
"clip_ratio/low_mean": 0.0031250000465661287,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006031976779922843,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2778.0,
"completions/max_terminated_length": 2778.0,
"completions/mean_length": 2272.375,
"completions/mean_terminated_length": 2272.375,
"completions/min_length": 2023.0,
"completions/min_terminated_length": 2023.0,
"entropy": 0.08835586486384273,
"epoch": 0.00544,
"frac_reward_zero_std": 0.75,
"grad_norm": 1.3424510955810547,
"kl": 0.030128376907669008,
"learning_rate": 7.999975676434692e-06,
"loss": -0.0562,
"num_tokens": 5148670.0,
"reward": 0.2918750047683716,
"reward_std": 0.0162500012665987,
"rewards/rollout_reward_func/mean": 0.2918750047683716,
"rewards/rollout_reward_func/std": 0.031971510499715805,
"sampling/importance_sampling_ratio/max": 2.0581729412078857,
"sampling/importance_sampling_ratio/mean": 1.0349149703979492,
"sampling/importance_sampling_ratio/min": 0.4890825152397156,
"sampling/sampling_logp_difference/max": 0.7274646759033203,
"sampling/sampling_logp_difference/mean": 0.019628014415502548,
"step": 68,
"step_time": 16.927781462999747
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2434.0,
"completions/max_terminated_length": 2434.0,
"completions/mean_length": 1876.4375,
"completions/mean_terminated_length": 1876.4375,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.0765807363204658,
"epoch": 0.00552,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.9624868631362915,
"kl": 0.05591569049283862,
"learning_rate": 7.999974132460596e-06,
"loss": 0.0124,
"num_tokens": 5221035.0,
"reward": 0.4793750047683716,
"reward_std": 0.17558756470680237,
"rewards/rollout_reward_func/mean": 0.4793750047683716,
"rewards/rollout_reward_func/std": 0.3699514865875244,
"sampling/importance_sampling_ratio/max": 1.3370881080627441,
"sampling/importance_sampling_ratio/mean": 0.9259651899337769,
"sampling/importance_sampling_ratio/min": 0.3385222852230072,
"sampling/sampling_logp_difference/max": 1.2049891948699951,
"sampling/sampling_logp_difference/mean": 0.014953669160604477,
"step": 69,
"step_time": 15.703423997999835
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2437.0,
"completions/max_terminated_length": 2437.0,
"completions/mean_length": 1896.46875,
"completions/mean_terminated_length": 1896.46875,
"completions/min_length": 1054.0,
"completions/min_terminated_length": 1054.0,
"entropy": 0.05552901164628565,
"epoch": 0.0056,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.2667936384677887,
"kl": 0.014588030888262438,
"learning_rate": 7.999972540979884e-06,
"loss": 0.0088,
"num_tokens": 5294017.0,
"reward": 0.5,
"reward_std": 0.125,
"rewards/rollout_reward_func/mean": 0.5,
"rewards/rollout_reward_func/std": 0.39909571409225464,
"sampling/importance_sampling_ratio/max": 1.607146143913269,
"sampling/importance_sampling_ratio/mean": 0.9701419472694397,
"sampling/importance_sampling_ratio/min": 0.401731014251709,
"sampling/sampling_logp_difference/max": 1.0000518560409546,
"sampling/sampling_logp_difference/mean": 0.010325020179152489,
"step": 70,
"step_time": 15.35116849200017
},
{
"clip_ratio/high_max": 0.0069659443106502295,
"clip_ratio/high_mean": 0.0034829721553251147,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034829721553251147,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2805.0,
"completions/max_terminated_length": 2805.0,
"completions/mean_length": 2170.65625,
"completions/mean_terminated_length": 2170.65625,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.09610213804990053,
"epoch": 0.00568,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.5360183119773865,
"kl": 0.029458090604748577,
"learning_rate": 7.99997090199258e-06,
"loss": 0.0331,
"num_tokens": 5376683.0,
"reward": 0.515625,
"reward_std": 0.12233918905258179,
"rewards/rollout_reward_func/mean": 0.515625,
"rewards/rollout_reward_func/std": 0.3893662095069885,
"sampling/importance_sampling_ratio/max": 2.599726438522339,
"sampling/importance_sampling_ratio/mean": 0.9910818934440613,
"sampling/importance_sampling_ratio/min": 0.38567137718200684,
"sampling/sampling_logp_difference/max": 0.9569785594940186,
"sampling/sampling_logp_difference/mean": 0.02182850055396557,
"step": 71,
"step_time": 18.631503620000103
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0018382353009656072,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003574346425011754,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2434.0,
"completions/max_terminated_length": 2434.0,
"completions/mean_length": 1713.6875,
"completions/mean_terminated_length": 1713.6875,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.05361162032932043,
"epoch": 0.00576,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.6218546628952026,
"kl": 0.02329419369925745,
"learning_rate": 7.999969215498707e-06,
"loss": -0.0147,
"num_tokens": 5443189.0,
"reward": 0.6181250214576721,
"reward_std": 0.2755875587463379,
"rewards/rollout_reward_func/mean": 0.6181250214576721,
"rewards/rollout_reward_func/std": 0.45625001192092896,
"sampling/importance_sampling_ratio/max": 1.4299272298812866,
"sampling/importance_sampling_ratio/mean": 0.9309602975845337,
"sampling/importance_sampling_ratio/min": 0.39184144139289856,
"sampling/sampling_logp_difference/max": 0.9363220930099487,
"sampling/sampling_logp_difference/mean": 0.011649301275610924,
"step": 72,
"step_time": 15.753308050000896
},
{
"clip_ratio/high_max": 0.011660009622573853,
"clip_ratio/high_mean": 0.005830004811286926,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007566115935333073,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2801.0,
"completions/max_terminated_length": 2801.0,
"completions/mean_length": 1721.4375,
"completions/mean_terminated_length": 1721.4375,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.06148386397399008,
"epoch": 0.00584,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.5323787927627563,
"kl": 0.02491366575122811,
"learning_rate": 7.999967481498294e-06,
"loss": 0.0102,
"num_tokens": 5509968.0,
"reward": 0.5487499833106995,
"reward_std": 0.33183753490448,
"rewards/rollout_reward_func/mean": 0.5487499833106995,
"rewards/rollout_reward_func/std": 0.4589731991291046,
"sampling/importance_sampling_ratio/max": 1.9993343353271484,
"sampling/importance_sampling_ratio/mean": 0.9622111916542053,
"sampling/importance_sampling_ratio/min": 0.34618350863456726,
"sampling/sampling_logp_difference/max": 1.054746150970459,
"sampling/sampling_logp_difference/mean": 0.013135725632309914,
"step": 73,
"step_time": 16.009405589999915
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2792.0,
"completions/max_terminated_length": 2792.0,
"completions/mean_length": 1907.75,
"completions/mean_terminated_length": 1907.75,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.08447153866291046,
"epoch": 0.00592,
"frac_reward_zero_std": 0.625,
"grad_norm": 1.2818049192428589,
"kl": 0.03082829067716375,
"learning_rate": 7.999965699991369e-06,
"loss": 0.0268,
"num_tokens": 5583748.0,
"reward": 0.44875001907348633,
"reward_std": 0.20683756470680237,
"rewards/rollout_reward_func/mean": 0.44875001907348633,
"rewards/rollout_reward_func/std": 0.3857104480266571,
"sampling/importance_sampling_ratio/max": 1.9656041860580444,
"sampling/importance_sampling_ratio/mean": 1.0510060787200928,
"sampling/importance_sampling_ratio/min": 0.4638214409351349,
"sampling/sampling_logp_difference/max": 0.651539146900177,
"sampling/sampling_logp_difference/mean": 0.016923408955335617,
"step": 74,
"step_time": 16.37548145899973
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0016447368543595076,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0016447368543595076,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2794.0,
"completions/max_terminated_length": 2794.0,
"completions/mean_length": 1590.5,
"completions/mean_terminated_length": 1590.5,
"completions/min_length": 1054.0,
"completions/min_terminated_length": 1054.0,
"entropy": 0.051554064732044935,
"epoch": 0.006,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.4656570851802826,
"kl": 0.039722154615446925,
"learning_rate": 7.99996387097796e-06,
"loss": -0.0273,
"num_tokens": 5646147.0,
"reward": 0.8512499928474426,
"reward_std": 0.13466876745224,
"rewards/rollout_reward_func/mean": 0.8512499928474426,
"rewards/rollout_reward_func/std": 0.4309011399745941,
"sampling/importance_sampling_ratio/max": 2.459237575531006,
"sampling/importance_sampling_ratio/mean": 1.0666892528533936,
"sampling/importance_sampling_ratio/min": 0.403707355260849,
"sampling/sampling_logp_difference/max": 0.9005355834960938,
"sampling/sampling_logp_difference/mean": 0.012501123361289501,
"step": 75,
"step_time": 16.584246522000285
},
{
"epoch": 0.006,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 2216.95,
"eval_completions/max_terminated_length": 2216.95,
"eval_completions/mean_length": 1925.65,
"eval_completions/mean_terminated_length": 1925.65,
"eval_completions/min_length": 1634.35,
"eval_completions/min_terminated_length": 1634.35,
"eval_entropy": 0.07575540114194154,
"eval_frac_reward_zero_std": 1.0,
"eval_kl": 0.031034281105894478,
"eval_loss": 4.469734994927421e-05,
"eval_num_tokens": 5646147.0,
"eval_reward": 0.6004999987781048,
"eval_reward_std": 0.0,
"eval_rewards/rollout_reward_func/mean": 0.6004999987781048,
"eval_rewards/rollout_reward_func/std": 0.24536602906882762,
"eval_runtime": 13.987,
"eval_samples_per_second": 0.715,
"eval_sampling/importance_sampling_ratio/max": 1.2220476478338242,
"eval_sampling/importance_sampling_ratio/mean": 1.0154326111078262,
"eval_sampling/importance_sampling_ratio/min": 0.8088175728917122,
"eval_sampling/sampling_logp_difference/max": 0.25984298419207336,
"eval_sampling/sampling_logp_difference/mean": 0.014358489285223186,
"eval_steps_per_second": 0.357,
"step": 75
}
],
"logging_steps": 1.0,
"max_steps": 25000,
"num_input_tokens_seen": 5646147,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}