gin_rummy_4G / trainer_state.json
Gege24's picture
Upload task output 1
e35331b verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.006,
"eval_steps": 500,
"global_step": 75,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2783.0,
"completions/max_terminated_length": 2783.0,
"completions/mean_length": 2052.75,
"completions/mean_terminated_length": 2052.75,
"completions/min_length": 1055.0,
"completions/min_terminated_length": 1055.0,
"entropy": 0.4163087382912636,
"epoch": 8e-05,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.8663769960403442,
"kl": 0.0,
"learning_rate": 0.0,
"loss": -0.0386,
"num_tokens": 78630.0,
"reward": 0.46406251192092896,
"reward_std": 0.20054946839809418,
"rewards/rollout_reward_func/mean": 0.46406251192092896,
"rewards/rollout_reward_func/std": 0.37604784965515137,
"sampling/importance_sampling_ratio/max": 2.1498024463653564,
"sampling/importance_sampling_ratio/mean": 1.0975958108901978,
"sampling/importance_sampling_ratio/min": 0.241215318441391,
"sampling/sampling_logp_difference/max": 0.7405228614807129,
"sampling/sampling_logp_difference/mean": 0.039819031953811646,
"step": 1,
"step_time": 14.418279634999976
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2798.0,
"completions/max_terminated_length": 2798.0,
"completions/mean_length": 2084.21875,
"completions/mean_terminated_length": 2084.21875,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.3995310440659523,
"epoch": 0.00016,
"frac_reward_zero_std": 0.25,
"grad_norm": 2.142817735671997,
"kl": 0.0,
"learning_rate": 1.7142857142857143e-07,
"loss": 0.016,
"num_tokens": 158194.0,
"reward": 0.3384375274181366,
"reward_std": 0.16842570900917053,
"rewards/rollout_reward_func/mean": 0.3384375274181366,
"rewards/rollout_reward_func/std": 0.27340278029441833,
"sampling/importance_sampling_ratio/max": 1.9602876901626587,
"sampling/importance_sampling_ratio/mean": 0.992855966091156,
"sampling/importance_sampling_ratio/min": 0.46628525853157043,
"sampling/sampling_logp_difference/max": 0.6929764747619629,
"sampling/sampling_logp_difference/mean": 0.04201715067028999,
"step": 2,
"step_time": 13.260429134000105
},
{
"clip_ratio/high_max": 0.04315628902986646,
"clip_ratio/high_mean": 0.012242560740560293,
"clip_ratio/low_mean": 0.011964043835178018,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.024206604342907667,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2776.0,
"completions/max_terminated_length": 2776.0,
"completions/mean_length": 1875.09375,
"completions/mean_terminated_length": 1875.09375,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.38934508711099625,
"epoch": 0.00024,
"frac_reward_zero_std": 0.25,
"grad_norm": 2.2311601638793945,
"kl": 0.003617420152295381,
"learning_rate": 3.4285714285714286e-07,
"loss": -0.0954,
"num_tokens": 230320.0,
"reward": 0.4612500071525574,
"reward_std": 0.22380851209163666,
"rewards/rollout_reward_func/mean": 0.4612500071525574,
"rewards/rollout_reward_func/std": 0.3984546363353729,
"sampling/importance_sampling_ratio/max": 1.6067352294921875,
"sampling/importance_sampling_ratio/mean": 0.9242645502090454,
"sampling/importance_sampling_ratio/min": 0.17279618978500366,
"sampling/sampling_logp_difference/max": 1.4119317531585693,
"sampling/sampling_logp_difference/mean": 0.045969706028699875,
"step": 3,
"step_time": 12.304947445999915
},
{
"clip_ratio/high_max": 0.023281023371964693,
"clip_ratio/high_mean": 0.012716594734229147,
"clip_ratio/low_mean": 0.01039634458720684,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.023112939670681953,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2783.0,
"completions/max_terminated_length": 2783.0,
"completions/mean_length": 2251.84375,
"completions/mean_terminated_length": 2251.84375,
"completions/min_length": 1562.0,
"completions/min_terminated_length": 1562.0,
"entropy": 0.4188930094242096,
"epoch": 0.00032,
"frac_reward_zero_std": 0.25,
"grad_norm": 2.3432626724243164,
"kl": 0.005323103512637317,
"learning_rate": 5.142857142857143e-07,
"loss": -0.1037,
"num_tokens": 315875.0,
"reward": 0.2640625238418579,
"reward_std": 0.07438889145851135,
"rewards/rollout_reward_func/mean": 0.2640625238418579,
"rewards/rollout_reward_func/std": 0.09810657054185867,
"sampling/importance_sampling_ratio/max": 2.92923903465271,
"sampling/importance_sampling_ratio/mean": 1.0071074962615967,
"sampling/importance_sampling_ratio/min": 0.30356213450431824,
"sampling/sampling_logp_difference/max": 0.9253432750701904,
"sampling/sampling_logp_difference/mean": 0.04933081567287445,
"step": 4,
"step_time": 13.287707804000092
},
{
"clip_ratio/high_max": 0.04239537985995412,
"clip_ratio/high_mean": 0.018673060229048133,
"clip_ratio/low_mean": 0.0042297979816794395,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.022902858443558216,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2790.0,
"completions/max_terminated_length": 2790.0,
"completions/mean_length": 2197.3125,
"completions/mean_terminated_length": 2197.3125,
"completions/min_length": 1559.0,
"completions/min_terminated_length": 1559.0,
"entropy": 0.4414307102560997,
"epoch": 0.0004,
"frac_reward_zero_std": 0.25,
"grad_norm": 2.470518112182617,
"kl": 0.004553150560241193,
"learning_rate": 6.857142857142857e-07,
"loss": 0.1372,
"num_tokens": 399370.0,
"reward": 0.40281248092651367,
"reward_std": 0.16662904620170593,
"rewards/rollout_reward_func/mean": 0.40281248092651367,
"rewards/rollout_reward_func/std": 0.3357921242713928,
"sampling/importance_sampling_ratio/max": 2.2576870918273926,
"sampling/importance_sampling_ratio/mean": 1.0002690553665161,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.462371826171875,
"sampling/sampling_logp_difference/mean": 0.053694289177656174,
"step": 5,
"step_time": 13.068715858000132
},
{
"clip_ratio/high_max": 0.02923969691619277,
"clip_ratio/high_mean": 0.01021690119523555,
"clip_ratio/low_mean": 0.01101089478470385,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.021227796096354723,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2428.0,
"completions/max_terminated_length": 2428.0,
"completions/mean_length": 1826.09375,
"completions/mean_terminated_length": 1826.09375,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.37763065844774246,
"epoch": 0.00048,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.4737789630889893,
"kl": 0.003610707528423518,
"learning_rate": 8.571428571428571e-07,
"loss": 0.0212,
"num_tokens": 469858.0,
"reward": 0.4584375023841858,
"reward_std": 0.2892817258834839,
"rewards/rollout_reward_func/mean": 0.4584375023841858,
"rewards/rollout_reward_func/std": 0.4035496413707733,
"sampling/importance_sampling_ratio/max": 1.8672934770584106,
"sampling/importance_sampling_ratio/mean": 0.9250987768173218,
"sampling/importance_sampling_ratio/min": 0.2111542820930481,
"sampling/sampling_logp_difference/max": 1.105020523071289,
"sampling/sampling_logp_difference/mean": 0.04392547905445099,
"step": 6,
"step_time": 11.808095918000163
},
{
"clip_ratio/high_max": 0.02163859363645315,
"clip_ratio/high_mean": 0.007195362821221352,
"clip_ratio/low_mean": 0.009288194705732167,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01648355764336884,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2799.0,
"completions/max_terminated_length": 2799.0,
"completions/mean_length": 2101.9375,
"completions/mean_terminated_length": 2101.9375,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.4031049609184265,
"epoch": 0.00056,
"frac_reward_zero_std": 0.375,
"grad_norm": 2.241011142730713,
"kl": 0.004900285159237683,
"learning_rate": 1.0285714285714286e-06,
"loss": 0.0307,
"num_tokens": 549695.0,
"reward": 0.32218751311302185,
"reward_std": 0.10592572391033173,
"rewards/rollout_reward_func/mean": 0.32218751311302185,
"rewards/rollout_reward_func/std": 0.22224271297454834,
"sampling/importance_sampling_ratio/max": 2.7520875930786133,
"sampling/importance_sampling_ratio/mean": 0.9687752723693848,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.8389774560928345,
"sampling/sampling_logp_difference/mean": 0.043909620493650436,
"step": 7,
"step_time": 13.085814365000147
},
{
"clip_ratio/high_max": 0.029183519072830677,
"clip_ratio/high_mean": 0.008625667076557875,
"clip_ratio/low_mean": 0.016130636679008603,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02475630398839712,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2794.0,
"completions/max_terminated_length": 2794.0,
"completions/mean_length": 2250.71875,
"completions/mean_terminated_length": 2250.71875,
"completions/min_length": 1570.0,
"completions/min_terminated_length": 1570.0,
"entropy": 0.43513813614845276,
"epoch": 0.00064,
"frac_reward_zero_std": 0.25,
"grad_norm": 2.6571757793426514,
"kl": 0.0038885354879312217,
"learning_rate": 1.2000000000000002e-06,
"loss": -0.0896,
"num_tokens": 634822.0,
"reward": 0.30375000834465027,
"reward_std": 0.11063194274902344,
"rewards/rollout_reward_func/mean": 0.30375000834465027,
"rewards/rollout_reward_func/std": 0.22577106952667236,
"sampling/importance_sampling_ratio/max": 2.24173641204834,
"sampling/importance_sampling_ratio/mean": 0.9777867794036865,
"sampling/importance_sampling_ratio/min": 0.4010058343410492,
"sampling/sampling_logp_difference/max": 0.9179394245147705,
"sampling/sampling_logp_difference/mean": 0.0499531514942646,
"step": 8,
"step_time": 13.055169309000007
},
{
"clip_ratio/high_max": 0.014742525294423103,
"clip_ratio/high_mean": 0.003685631323605776,
"clip_ratio/low_mean": 0.008176195668056607,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011861827224493027,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2810.0,
"completions/max_terminated_length": 2810.0,
"completions/mean_length": 1657.03125,
"completions/mean_terminated_length": 1657.03125,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.39970337599515915,
"epoch": 0.00072,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.901354432106018,
"kl": 0.005712996702641249,
"learning_rate": 1.3714285714285715e-06,
"loss": 0.0004,
"num_tokens": 699616.0,
"reward": 0.3528124690055847,
"reward_std": 0.20240315794944763,
"rewards/rollout_reward_func/mean": 0.3528124690055847,
"rewards/rollout_reward_func/std": 0.3597510755062103,
"sampling/importance_sampling_ratio/max": 2.387613296508789,
"sampling/importance_sampling_ratio/mean": 1.0771517753601074,
"sampling/importance_sampling_ratio/min": 0.5435174703598022,
"sampling/sampling_logp_difference/max": 0.6833771467208862,
"sampling/sampling_logp_difference/mean": 0.04181923717260361,
"step": 9,
"step_time": 13.294292020000057
},
{
"clip_ratio/high_max": 0.03906210558488965,
"clip_ratio/high_mean": 0.015391179244033992,
"clip_ratio/low_mean": 0.0073633925057947636,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.022754571866244078,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2793.0,
"completions/max_terminated_length": 2793.0,
"completions/mean_length": 2214.53125,
"completions/mean_terminated_length": 2214.53125,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.41637370735406876,
"epoch": 0.0008,
"frac_reward_zero_std": 0.25,
"grad_norm": 2.541189670562744,
"kl": 0.004748326842673123,
"learning_rate": 1.5428571428571428e-06,
"loss": -0.0168,
"num_tokens": 783707.0,
"reward": 0.4284375309944153,
"reward_std": 0.1260128915309906,
"rewards/rollout_reward_func/mean": 0.4284375309944153,
"rewards/rollout_reward_func/std": 0.3622608780860901,
"sampling/importance_sampling_ratio/max": 2.2261769771575928,
"sampling/importance_sampling_ratio/mean": 1.042180061340332,
"sampling/importance_sampling_ratio/min": 0.2320551723241806,
"sampling/sampling_logp_difference/max": 1.021528959274292,
"sampling/sampling_logp_difference/mean": 0.04730905592441559,
"step": 10,
"step_time": 13.637110585000073
},
{
"clip_ratio/high_max": 0.010990338400006294,
"clip_ratio/high_mean": 0.0027475846000015736,
"clip_ratio/low_mean": 0.0016025641234591603,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004350148723460734,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2793.0,
"completions/max_terminated_length": 2793.0,
"completions/mean_length": 1966.4375,
"completions/mean_terminated_length": 1966.4375,
"completions/min_length": 1055.0,
"completions/min_terminated_length": 1055.0,
"entropy": 0.4171219617128372,
"epoch": 0.00088,
"frac_reward_zero_std": 0.625,
"grad_norm": 1.4207873344421387,
"kl": 0.0035471616429276764,
"learning_rate": 1.7142857142857143e-06,
"loss": 0.0422,
"num_tokens": 858994.0,
"reward": 0.4596875309944153,
"reward_std": 0.14279377460479736,
"rewards/rollout_reward_func/mean": 0.4596875309944153,
"rewards/rollout_reward_func/std": 0.3725802004337311,
"sampling/importance_sampling_ratio/max": 1.874053716659546,
"sampling/importance_sampling_ratio/mean": 0.9027889966964722,
"sampling/importance_sampling_ratio/min": 0.45684853196144104,
"sampling/sampling_logp_difference/max": 0.5253086090087891,
"sampling/sampling_logp_difference/mean": 0.0447448305785656,
"step": 11,
"step_time": 12.90678849699998
},
{
"clip_ratio/high_max": 0.039288708940148354,
"clip_ratio/high_mean": 0.017087680520489812,
"clip_ratio/low_mean": 0.008439590455964208,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02552727097645402,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2782.0,
"completions/max_terminated_length": 2782.0,
"completions/mean_length": 1859.5625,
"completions/mean_terminated_length": 1859.5625,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.42384539544582367,
"epoch": 0.00096,
"frac_reward_zero_std": 0.375,
"grad_norm": 2.1793887615203857,
"kl": 0.004719441349152476,
"learning_rate": 1.8857142857142858e-06,
"loss": -0.0501,
"num_tokens": 930647.0,
"reward": 0.5653125047683716,
"reward_std": 0.09132834523916245,
"rewards/rollout_reward_func/mean": 0.5653125047683716,
"rewards/rollout_reward_func/std": 0.4122869372367859,
"sampling/importance_sampling_ratio/max": 1.968488097190857,
"sampling/importance_sampling_ratio/mean": 1.1238960027694702,
"sampling/importance_sampling_ratio/min": 0.5891481637954712,
"sampling/sampling_logp_difference/max": 0.9189000129699707,
"sampling/sampling_logp_difference/mean": 0.045362215489149094,
"step": 12,
"step_time": 12.105040577999944
},
{
"clip_ratio/high_max": 0.03163956617936492,
"clip_ratio/high_mean": 0.009185401839204133,
"clip_ratio/low_mean": 0.01267810445278883,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.021863506408408284,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2794.0,
"completions/max_terminated_length": 2794.0,
"completions/mean_length": 1744.0625,
"completions/mean_terminated_length": 1744.0625,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.38077671080827713,
"epoch": 0.00104,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.601933240890503,
"kl": 0.0043588640401139855,
"learning_rate": 2.0571428571428573e-06,
"loss": -0.0271,
"num_tokens": 998268.0,
"reward": 0.5040624737739563,
"reward_std": 0.3648141622543335,
"rewards/rollout_reward_func/mean": 0.5040624737739563,
"rewards/rollout_reward_func/std": 0.4420104920864105,
"sampling/importance_sampling_ratio/max": 2.2825241088867188,
"sampling/importance_sampling_ratio/mean": 1.0028969049453735,
"sampling/importance_sampling_ratio/min": 0.37051475048065186,
"sampling/sampling_logp_difference/max": 0.6929263472557068,
"sampling/sampling_logp_difference/mean": 0.043037254363298416,
"step": 13,
"step_time": 12.434848872999964
},
{
"clip_ratio/high_max": 0.0682385629042983,
"clip_ratio/high_mean": 0.022985405288636684,
"clip_ratio/low_mean": 0.0055555556900799274,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.028540961910039186,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2797.0,
"completions/max_terminated_length": 2797.0,
"completions/mean_length": 2003.40625,
"completions/mean_terminated_length": 2003.40625,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.40549617260694504,
"epoch": 0.00112,
"frac_reward_zero_std": 0.125,
"grad_norm": 2.6730706691741943,
"kl": 0.004465080099180341,
"learning_rate": 2.2285714285714286e-06,
"loss": 0.0367,
"num_tokens": 1075200.0,
"reward": 0.3971875011920929,
"reward_std": 0.24656714498996735,
"rewards/rollout_reward_func/mean": 0.3971875011920929,
"rewards/rollout_reward_func/std": 0.3921506702899933,
"sampling/importance_sampling_ratio/max": 2.08994197845459,
"sampling/importance_sampling_ratio/mean": 0.9472236037254333,
"sampling/importance_sampling_ratio/min": 0.2920815646648407,
"sampling/sampling_logp_difference/max": 0.5747750997543335,
"sampling/sampling_logp_difference/mean": 0.0427585169672966,
"step": 14,
"step_time": 13.12732943400033
},
{
"clip_ratio/high_max": 0.04484127042815089,
"clip_ratio/high_mean": 0.01285505446139723,
"clip_ratio/low_mean": 0.008134920848533511,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.020989975426346064,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2801.0,
"completions/max_terminated_length": 2801.0,
"completions/mean_length": 1879.96875,
"completions/mean_terminated_length": 1879.96875,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.3704817444086075,
"epoch": 0.0012,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.3741340637207031,
"kl": 0.004930144699756056,
"learning_rate": 2.4000000000000003e-06,
"loss": -0.036,
"num_tokens": 1147654.0,
"reward": 0.5634374618530273,
"reward_std": 0.2032102644443512,
"rewards/rollout_reward_func/mean": 0.5634374618530273,
"rewards/rollout_reward_func/std": 0.4479026794433594,
"sampling/importance_sampling_ratio/max": 1.419919490814209,
"sampling/importance_sampling_ratio/mean": 0.8213506937026978,
"sampling/importance_sampling_ratio/min": 0.2297196239233017,
"sampling/sampling_logp_difference/max": 0.9635820388793945,
"sampling/sampling_logp_difference/mean": 0.04450097680091858,
"step": 15,
"step_time": 12.787183091000088
},
{
"clip_ratio/high_max": 0.040403091348707676,
"clip_ratio/high_mean": 0.019732415094040334,
"clip_ratio/low_mean": 0.011093285749666393,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.030825700610876083,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2812.0,
"completions/max_terminated_length": 2812.0,
"completions/mean_length": 2226.75,
"completions/mean_terminated_length": 2226.75,
"completions/min_length": 1054.0,
"completions/min_terminated_length": 1054.0,
"entropy": 0.4390428438782692,
"epoch": 0.00128,
"frac_reward_zero_std": 0.125,
"grad_norm": 3.3430113792419434,
"kl": 0.005434123100712895,
"learning_rate": 2.571428571428571e-06,
"loss": 0.0608,
"num_tokens": 1232479.0,
"reward": 0.35874998569488525,
"reward_std": 0.16885429620742798,
"rewards/rollout_reward_func/mean": 0.35874998569488525,
"rewards/rollout_reward_func/std": 0.31368517875671387,
"sampling/importance_sampling_ratio/max": 2.1880178451538086,
"sampling/importance_sampling_ratio/mean": 0.9618589878082275,
"sampling/importance_sampling_ratio/min": 0.12961336970329285,
"sampling/sampling_logp_difference/max": 0.941362738609314,
"sampling/sampling_logp_difference/mean": 0.05188923329114914,
"step": 16,
"step_time": 13.173405885999728
},
{
"clip_ratio/high_max": 0.04390919208526611,
"clip_ratio/high_mean": 0.017439239425584674,
"clip_ratio/low_mean": 0.007801226573064923,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.025240465998649597,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2434.0,
"completions/max_terminated_length": 2434.0,
"completions/mean_length": 1750.0625,
"completions/mean_terminated_length": 1750.0625,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.3980557546019554,
"epoch": 0.00136,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.602077007293701,
"kl": 0.004334585275501013,
"learning_rate": 2.742857142857143e-06,
"loss": -0.0264,
"num_tokens": 1300635.0,
"reward": 0.38499999046325684,
"reward_std": 0.221183180809021,
"rewards/rollout_reward_func/mean": 0.38499999046325684,
"rewards/rollout_reward_func/std": 0.34839722514152527,
"sampling/importance_sampling_ratio/max": 1.7235372066497803,
"sampling/importance_sampling_ratio/mean": 0.9467421770095825,
"sampling/importance_sampling_ratio/min": 0.2654297649860382,
"sampling/sampling_logp_difference/max": 0.7773740887641907,
"sampling/sampling_logp_difference/mean": 0.04712219163775444,
"step": 17,
"step_time": 11.411276259999795
},
{
"clip_ratio/high_max": 0.04594441968947649,
"clip_ratio/high_mean": 0.013718247646465898,
"clip_ratio/low_mean": 0.004949534311890602,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.018667781492695212,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2798.0,
"completions/max_terminated_length": 2798.0,
"completions/mean_length": 2309.09375,
"completions/mean_terminated_length": 2309.09375,
"completions/min_length": 1055.0,
"completions/min_terminated_length": 1055.0,
"entropy": 0.4307108670473099,
"epoch": 0.00144,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.9505234956741333,
"kl": 0.004669323505368084,
"learning_rate": 2.9142857142857142e-06,
"loss": 0.0981,
"num_tokens": 1388529.0,
"reward": 0.3696874976158142,
"reward_std": 0.155008003115654,
"rewards/rollout_reward_func/mean": 0.3696874976158142,
"rewards/rollout_reward_func/std": 0.28414538502693176,
"sampling/importance_sampling_ratio/max": 1.8336728811264038,
"sampling/importance_sampling_ratio/mean": 0.9352109432220459,
"sampling/importance_sampling_ratio/min": 0.28059616684913635,
"sampling/sampling_logp_difference/max": 1.0694303512573242,
"sampling/sampling_logp_difference/mean": 0.05270082503557205,
"step": 18,
"step_time": 13.463537552000162
},
{
"clip_ratio/high_max": 0.03351574344560504,
"clip_ratio/high_mean": 0.017963151913136244,
"clip_ratio/low_mean": 0.005672972998581827,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.023636124562472105,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2782.0,
"completions/max_terminated_length": 2782.0,
"completions/mean_length": 2203.90625,
"completions/mean_terminated_length": 2203.90625,
"completions/min_length": 1564.0,
"completions/min_terminated_length": 1564.0,
"entropy": 0.4163732975721359,
"epoch": 0.00152,
"frac_reward_zero_std": 0.25,
"grad_norm": 2.54580020904541,
"kl": 0.0039027896127663553,
"learning_rate": 3.0857142857142855e-06,
"loss": -0.0385,
"num_tokens": 1472480.0,
"reward": 0.2887499928474426,
"reward_std": 0.1067335307598114,
"rewards/rollout_reward_func/mean": 0.2887499928474426,
"rewards/rollout_reward_func/std": 0.17496080696582794,
"sampling/importance_sampling_ratio/max": 2.46917724609375,
"sampling/importance_sampling_ratio/mean": 1.0520013570785522,
"sampling/importance_sampling_ratio/min": 0.31319668889045715,
"sampling/sampling_logp_difference/max": 0.6751515865325928,
"sampling/sampling_logp_difference/mean": 0.04795370250940323,
"step": 19,
"step_time": 12.881922476999762
},
{
"clip_ratio/high_max": 0.03289473615586758,
"clip_ratio/high_mean": 0.016854635905474424,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.016854635905474424,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2803.0,
"completions/max_terminated_length": 2803.0,
"completions/mean_length": 1839.5625,
"completions/mean_terminated_length": 1839.5625,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.3896697387099266,
"epoch": 0.0016,
"frac_reward_zero_std": 0.5,
"grad_norm": 3.0274949073791504,
"kl": 0.004332752665504813,
"learning_rate": 3.257142857142857e-06,
"loss": 0.1087,
"num_tokens": 1543546.0,
"reward": 0.4571874737739563,
"reward_std": 0.20620864629745483,
"rewards/rollout_reward_func/mean": 0.4571874737739563,
"rewards/rollout_reward_func/std": 0.38446637988090515,
"sampling/importance_sampling_ratio/max": 2.2497854232788086,
"sampling/importance_sampling_ratio/mean": 0.9864073395729065,
"sampling/importance_sampling_ratio/min": 0.3370327055454254,
"sampling/sampling_logp_difference/max": 0.9195313453674316,
"sampling/sampling_logp_difference/mean": 0.04598519578576088,
"step": 20,
"step_time": 13.28698261799991
},
{
"clip_ratio/high_max": 0.047807968221604824,
"clip_ratio/high_mean": 0.017668311716988683,
"clip_ratio/low_mean": 0.0087070451118052,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.026375357527285814,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2784.0,
"completions/max_terminated_length": 2784.0,
"completions/mean_length": 1916.84375,
"completions/mean_terminated_length": 1916.84375,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.3857065215706825,
"epoch": 0.00168,
"frac_reward_zero_std": 0.125,
"grad_norm": 2.2928388118743896,
"kl": 0.0030007859459146857,
"learning_rate": 3.4285714285714285e-06,
"loss": 0.0691,
"num_tokens": 1617299.0,
"reward": 0.5199999809265137,
"reward_std": 0.24810142815113068,
"rewards/rollout_reward_func/mean": 0.5199999809265137,
"rewards/rollout_reward_func/std": 0.44394853711128235,
"sampling/importance_sampling_ratio/max": 1.9692103862762451,
"sampling/importance_sampling_ratio/mean": 1.0206944942474365,
"sampling/importance_sampling_ratio/min": 0.37676262855529785,
"sampling/sampling_logp_difference/max": 0.5263509750366211,
"sampling/sampling_logp_difference/mean": 0.04122690111398697,
"step": 21,
"step_time": 12.188486421000334
},
{
"clip_ratio/high_max": 0.040458154398947954,
"clip_ratio/high_mean": 0.011364538804627955,
"clip_ratio/low_mean": 0.006526540499180555,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.017891079653054476,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2801.0,
"completions/max_terminated_length": 2801.0,
"completions/mean_length": 1878.4375,
"completions/mean_terminated_length": 1878.4375,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.3931129276752472,
"epoch": 0.00176,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.7913806438446045,
"kl": 0.004387594643048942,
"learning_rate": 3.6e-06,
"loss": -0.0657,
"num_tokens": 1690165.0,
"reward": 0.48281246423721313,
"reward_std": 0.24489575624465942,
"rewards/rollout_reward_func/mean": 0.48281246423721313,
"rewards/rollout_reward_func/std": 0.42833685874938965,
"sampling/importance_sampling_ratio/max": 2.001044750213623,
"sampling/importance_sampling_ratio/mean": 0.8716533780097961,
"sampling/importance_sampling_ratio/min": 0.21946659684181213,
"sampling/sampling_logp_difference/max": 0.6549723148345947,
"sampling/sampling_logp_difference/mean": 0.04290828853845596,
"step": 22,
"step_time": 12.900562208999872
},
{
"clip_ratio/high_max": 0.012908496893942356,
"clip_ratio/high_mean": 0.003227124223485589,
"clip_ratio/low_mean": 0.0013888889225199819,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004616013146005571,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2795.0,
"completions/max_terminated_length": 2795.0,
"completions/mean_length": 1921.96875,
"completions/mean_terminated_length": 1921.96875,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.3991682603955269,
"epoch": 0.00184,
"frac_reward_zero_std": 0.375,
"grad_norm": 2.757072687149048,
"kl": 0.002871026110369712,
"learning_rate": 3.7714285714285716e-06,
"loss": -0.0322,
"num_tokens": 1764351.0,
"reward": 0.4725000262260437,
"reward_std": 0.15098075568675995,
"rewards/rollout_reward_func/mean": 0.4725000262260437,
"rewards/rollout_reward_func/std": 0.3937331438064575,
"sampling/importance_sampling_ratio/max": 2.473691463470459,
"sampling/importance_sampling_ratio/mean": 1.0277502536773682,
"sampling/importance_sampling_ratio/min": 0.3683130145072937,
"sampling/sampling_logp_difference/max": 0.8325839042663574,
"sampling/sampling_logp_difference/mean": 0.041013769805431366,
"step": 23,
"step_time": 12.923486550999996
},
{
"clip_ratio/high_max": 0.022044573910534382,
"clip_ratio/high_mean": 0.0072472544852644205,
"clip_ratio/low_mean": 0.007787698996253312,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015034952783025801,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2791.0,
"completions/max_terminated_length": 2791.0,
"completions/mean_length": 2014.1875,
"completions/mean_terminated_length": 2014.1875,
"completions/min_length": 1054.0,
"completions/min_terminated_length": 1054.0,
"entropy": 0.39251676946878433,
"epoch": 0.00192,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.9654884338378906,
"kl": 0.0081728242803365,
"learning_rate": 3.942857142857143e-06,
"loss": -0.0383,
"num_tokens": 1841628.0,
"reward": 0.35874998569488525,
"reward_std": 0.21719886362552643,
"rewards/rollout_reward_func/mean": 0.35874998569488525,
"rewards/rollout_reward_func/std": 0.31252095103263855,
"sampling/importance_sampling_ratio/max": 2.0834484100341797,
"sampling/importance_sampling_ratio/mean": 0.9893499612808228,
"sampling/importance_sampling_ratio/min": 0.06596492230892181,
"sampling/sampling_logp_difference/max": 1.764291524887085,
"sampling/sampling_logp_difference/mean": 0.05037356913089752,
"step": 24,
"step_time": 12.55188547500029
},
{
"clip_ratio/high_max": 0.03194444486871362,
"clip_ratio/high_mean": 0.009474206599406898,
"clip_ratio/low_mean": 0.004620927385985851,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.014095134101808071,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2432.0,
"completions/max_terminated_length": 2432.0,
"completions/mean_length": 1997.46875,
"completions/mean_terminated_length": 1997.46875,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.40253835916519165,
"epoch": 0.002,
"frac_reward_zero_std": 0.5,
"grad_norm": 2.1649582386016846,
"kl": 0.007934511464554816,
"learning_rate": 4.114285714285715e-06,
"loss": -0.084,
"num_tokens": 1918276.0,
"reward": 0.3425000011920929,
"reward_std": 0.16030071675777435,
"rewards/rollout_reward_func/mean": 0.3425000011920929,
"rewards/rollout_reward_func/std": 0.27845191955566406,
"sampling/importance_sampling_ratio/max": 1.7379083633422852,
"sampling/importance_sampling_ratio/mean": 1.0123233795166016,
"sampling/importance_sampling_ratio/min": 0.21978217363357544,
"sampling/sampling_logp_difference/max": 0.9820888042449951,
"sampling/sampling_logp_difference/mean": 0.043975915759801865,
"step": 25,
"step_time": 11.6416912709999
},
{
"clip_ratio/high_max": 0.057189542800188065,
"clip_ratio/high_mean": 0.02329625654965639,
"clip_ratio/low_mean": 0.008795286994427443,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03209154261276126,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2812.0,
"completions/max_terminated_length": 2812.0,
"completions/mean_length": 2010.0625,
"completions/mean_terminated_length": 2010.0625,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.3653796315193176,
"epoch": 0.00208,
"frac_reward_zero_std": 0.125,
"grad_norm": 2.1230344772338867,
"kl": 0.006589570315554738,
"learning_rate": 4.285714285714286e-06,
"loss": -0.0197,
"num_tokens": 1995372.0,
"reward": 0.4256249964237213,
"reward_std": 0.23703671991825104,
"rewards/rollout_reward_func/mean": 0.4256249964237213,
"rewards/rollout_reward_func/std": 0.3602412939071655,
"sampling/importance_sampling_ratio/max": 1.7632914781570435,
"sampling/importance_sampling_ratio/mean": 0.9213794469833374,
"sampling/importance_sampling_ratio/min": 0.4378761649131775,
"sampling/sampling_logp_difference/max": 0.56688392162323,
"sampling/sampling_logp_difference/mean": 0.03944293037056923,
"step": 26,
"step_time": 13.15109654299954
},
{
"clip_ratio/high_max": 0.04949874710291624,
"clip_ratio/high_mean": 0.02149919094517827,
"clip_ratio/low_mean": 0.00766741088591516,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02916660183109343,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2785.0,
"completions/max_terminated_length": 2785.0,
"completions/mean_length": 1842.0625,
"completions/mean_terminated_length": 1842.0625,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.4269302785396576,
"epoch": 0.00216,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.6013935804367065,
"kl": 0.00617267657071352,
"learning_rate": 4.457142857142857e-06,
"loss": -0.0345,
"num_tokens": 2066465.0,
"reward": 0.5221875309944153,
"reward_std": 0.22779378294944763,
"rewards/rollout_reward_func/mean": 0.5221875309944153,
"rewards/rollout_reward_func/std": 0.4334239661693573,
"sampling/importance_sampling_ratio/max": 2.312187433242798,
"sampling/importance_sampling_ratio/mean": 0.8621585369110107,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.9948511123657227,
"sampling/sampling_logp_difference/mean": 0.051924653351306915,
"step": 27,
"step_time": 12.681567872999949
},
{
"clip_ratio/high_max": 0.04371212236583233,
"clip_ratio/high_mean": 0.0183574166148901,
"clip_ratio/low_mean": 0.005908275721594691,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.024265691870823503,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2796.0,
"completions/max_terminated_length": 2796.0,
"completions/mean_length": 2155.5,
"completions/mean_terminated_length": 2155.5,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.41429970413446426,
"epoch": 0.00224,
"frac_reward_zero_std": 0.25,
"grad_norm": 2.647275447845459,
"kl": 0.010079714236781001,
"learning_rate": 4.628571428571429e-06,
"loss": -0.0864,
"num_tokens": 2148817.0,
"reward": 0.3021875023841858,
"reward_std": 0.11279378086328506,
"rewards/rollout_reward_func/mean": 0.3021875023841858,
"rewards/rollout_reward_func/std": 0.23064753413200378,
"sampling/importance_sampling_ratio/max": 2.1843345165252686,
"sampling/importance_sampling_ratio/mean": 0.9328470230102539,
"sampling/importance_sampling_ratio/min": 0.11585874110460281,
"sampling/sampling_logp_difference/max": 1.9821176528930664,
"sampling/sampling_logp_difference/mean": 0.05276907980442047,
"step": 28,
"step_time": 12.799536062000016
},
{
"clip_ratio/high_max": 0.039141415152698755,
"clip_ratio/high_mean": 0.019034530967473984,
"clip_ratio/low_mean": 0.005208333372138441,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02424286410678178,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2411.0,
"completions/max_terminated_length": 2411.0,
"completions/mean_length": 1544.21875,
"completions/mean_terminated_length": 1544.21875,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.38873114436864853,
"epoch": 0.00232,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.288419485092163,
"kl": 0.008441059850156307,
"learning_rate": 4.800000000000001e-06,
"loss": -0.0294,
"num_tokens": 2209518.0,
"reward": 0.5049999952316284,
"reward_std": 0.367961049079895,
"rewards/rollout_reward_func/mean": 0.5049999952316284,
"rewards/rollout_reward_func/std": 0.4586867392063141,
"sampling/importance_sampling_ratio/max": 1.7176055908203125,
"sampling/importance_sampling_ratio/mean": 0.8919655084609985,
"sampling/importance_sampling_ratio/min": 0.3174732029438019,
"sampling/sampling_logp_difference/max": 1.007685899734497,
"sampling/sampling_logp_difference/mean": 0.043198756873607635,
"step": 29,
"step_time": 11.569315259000177
},
{
"clip_ratio/high_max": 0.03119284799322486,
"clip_ratio/high_mean": 0.009251700364984572,
"clip_ratio/low_mean": 0.0032051282469183207,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012456828728318214,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2776.0,
"completions/max_terminated_length": 2776.0,
"completions/mean_length": 1695.40625,
"completions/mean_terminated_length": 1695.40625,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.38929111510515213,
"epoch": 0.0024,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.749756932258606,
"kl": 0.01017191493883729,
"learning_rate": 4.9714285714285715e-06,
"loss": 0.0146,
"num_tokens": 2275561.0,
"reward": 0.5309374928474426,
"reward_std": 0.32216140627861023,
"rewards/rollout_reward_func/mean": 0.5309374928474426,
"rewards/rollout_reward_func/std": 0.4390852451324463,
"sampling/importance_sampling_ratio/max": 2.9540531635284424,
"sampling/importance_sampling_ratio/mean": 1.0208276510238647,
"sampling/importance_sampling_ratio/min": 0.37041175365448,
"sampling/sampling_logp_difference/max": 0.5885751247406006,
"sampling/sampling_logp_difference/mean": 0.04683335870504379,
"step": 30,
"step_time": 12.191692169999897
},
{
"clip_ratio/high_max": 0.05563905602321029,
"clip_ratio/high_mean": 0.01747169380541891,
"clip_ratio/low_mean": 0.008184524020180106,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02565621805842966,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2783.0,
"completions/max_terminated_length": 2783.0,
"completions/mean_length": 1801.09375,
"completions/mean_terminated_length": 1801.09375,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.3914438411593437,
"epoch": 0.00248,
"frac_reward_zero_std": 0.125,
"grad_norm": 2.8585875034332275,
"kl": 0.015274998731911182,
"learning_rate": 5.142857142857142e-06,
"loss": 0.0419,
"num_tokens": 2345322.0,
"reward": 0.36281251907348633,
"reward_std": 0.2801453769207001,
"rewards/rollout_reward_func/mean": 0.36281251907348633,
"rewards/rollout_reward_func/std": 0.342911958694458,
"sampling/importance_sampling_ratio/max": 2.163181781768799,
"sampling/importance_sampling_ratio/mean": 0.9487945437431335,
"sampling/importance_sampling_ratio/min": 0.29707521200180054,
"sampling/sampling_logp_difference/max": 0.7824678421020508,
"sampling/sampling_logp_difference/mean": 0.0532098188996315,
"step": 31,
"step_time": 13.19187305000014
},
{
"clip_ratio/high_max": 0.03187447274103761,
"clip_ratio/high_mean": 0.018647319404408336,
"clip_ratio/low_mean": 0.004727297928184271,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02337461756542325,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2435.0,
"completions/max_terminated_length": 2435.0,
"completions/mean_length": 1984.90625,
"completions/mean_terminated_length": 1984.90625,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.416415698826313,
"epoch": 0.00256,
"frac_reward_zero_std": 0.25,
"grad_norm": 2.3030495643615723,
"kl": 0.015865659108385444,
"learning_rate": 5.314285714285714e-06,
"loss": -0.0567,
"num_tokens": 2421421.0,
"reward": 0.3878124952316284,
"reward_std": 0.23157384991645813,
"rewards/rollout_reward_func/mean": 0.3878124952316284,
"rewards/rollout_reward_func/std": 0.3412286341190338,
"sampling/importance_sampling_ratio/max": 2.5926010608673096,
"sampling/importance_sampling_ratio/mean": 0.9760158658027649,
"sampling/importance_sampling_ratio/min": 0.2061164528131485,
"sampling/sampling_logp_difference/max": 0.8063008785247803,
"sampling/sampling_logp_difference/mean": 0.04909588024020195,
"step": 32,
"step_time": 11.466194520999807
},
{
"clip_ratio/high_max": 0.019717262126505375,
"clip_ratio/high_mean": 0.004929315531626344,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004929315531626344,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2809.0,
"completions/max_terminated_length": 2809.0,
"completions/mean_length": 2102.71875,
"completions/mean_terminated_length": 2102.71875,
"completions/min_length": 1054.0,
"completions/min_terminated_length": 1054.0,
"entropy": 0.42558059841394424,
"epoch": 0.00264,
"frac_reward_zero_std": 0.625,
"grad_norm": 1.5914040803909302,
"kl": 0.010543531039729714,
"learning_rate": 5.485714285714286e-06,
"loss": 0.0448,
"num_tokens": 2501867.0,
"reward": 0.5221875309944153,
"reward_std": 0.14279377460479736,
"rewards/rollout_reward_func/mean": 0.5221875309944153,
"rewards/rollout_reward_func/std": 0.4007873833179474,
"sampling/importance_sampling_ratio/max": 1.5994207859039307,
"sampling/importance_sampling_ratio/mean": 0.8397550582885742,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.9267706871032715,
"sampling/sampling_logp_difference/mean": 0.0471554696559906,
"step": 33,
"step_time": 12.975996798000097
},
{
"clip_ratio/high_max": 0.040178571827709675,
"clip_ratio/high_mean": 0.016144166933372617,
"clip_ratio/low_mean": 0.005662594106979668,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.021806761040352285,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2441.0,
"completions/max_terminated_length": 2441.0,
"completions/mean_length": 1488.4375,
"completions/mean_terminated_length": 1488.4375,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.35695891827344894,
"epoch": 0.00272,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.6733559370040894,
"kl": 0.020034206565469503,
"learning_rate": 5.6571428571428576e-06,
"loss": -0.0588,
"num_tokens": 2560884.0,
"reward": 0.5859375,
"reward_std": 0.38607701659202576,
"rewards/rollout_reward_func/mean": 0.5859375,
"rewards/rollout_reward_func/std": 0.45654281973838806,
"sampling/importance_sampling_ratio/max": 1.8220971822738647,
"sampling/importance_sampling_ratio/mean": 0.9860107898712158,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.9601047039031982,
"sampling/sampling_logp_difference/mean": 0.052328821271657944,
"step": 34,
"step_time": 10.76481853400037
},
{
"clip_ratio/high_max": 0.00657894741743803,
"clip_ratio/high_mean": 0.003289473708719015,
"clip_ratio/low_mean": 0.008878070977516472,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012167544686235487,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2801.0,
"completions/max_terminated_length": 2801.0,
"completions/mean_length": 1756.46875,
"completions/mean_terminated_length": 1756.46875,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.38564804941415787,
"epoch": 0.0028,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.5950710773468018,
"kl": 0.0196278584189713,
"learning_rate": 5.8285714285714284e-06,
"loss": 0.0794,
"num_tokens": 2629098.0,
"reward": 0.4750000238418579,
"reward_std": 0.26933756470680237,
"rewards/rollout_reward_func/mean": 0.4750000238418579,
"rewards/rollout_reward_func/std": 0.40420371294021606,
"sampling/importance_sampling_ratio/max": 2.8944315910339355,
"sampling/importance_sampling_ratio/mean": 1.212613582611084,
"sampling/importance_sampling_ratio/min": 0.3920697867870331,
"sampling/sampling_logp_difference/max": 0.7614344358444214,
"sampling/sampling_logp_difference/mean": 0.050811417400836945,
"step": 35,
"step_time": 12.117880444999855
},
{
"clip_ratio/high_max": 0.032855731435120106,
"clip_ratio/high_mean": 0.008213932858780026,
"clip_ratio/low_mean": 0.008068988332524896,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.016282920725643635,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2819.0,
"completions/max_terminated_length": 2819.0,
"completions/mean_length": 2214.375,
"completions/mean_terminated_length": 2214.375,
"completions/min_length": 1579.0,
"completions/min_terminated_length": 1579.0,
"entropy": 0.4132639244198799,
"epoch": 0.00288,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.4248710870742798,
"kl": 0.04949819762259722,
"learning_rate": 6e-06,
"loss": -0.1152,
"num_tokens": 2713433.0,
"reward": 0.3043749928474426,
"reward_std": 0.08011817932128906,
"rewards/rollout_reward_func/mean": 0.3043749928474426,
"rewards/rollout_reward_func/std": 0.16871310770511627,
"sampling/importance_sampling_ratio/max": 2.279515504837036,
"sampling/importance_sampling_ratio/mean": 1.0208816528320312,
"sampling/importance_sampling_ratio/min": 0.2197788804769516,
"sampling/sampling_logp_difference/max": 1.5309280157089233,
"sampling/sampling_logp_difference/mean": 0.05491582304239273,
"step": 36,
"step_time": 13.165009270000155
},
{
"clip_ratio/high_max": 0.01785714365541935,
"clip_ratio/high_mean": 0.004464285913854837,
"clip_ratio/low_mean": 0.0022321429569274187,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006696428870782256,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2803.0,
"completions/max_terminated_length": 2803.0,
"completions/mean_length": 1736.1875,
"completions/mean_terminated_length": 1736.1875,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.3515569269657135,
"epoch": 0.00296,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.0670298337936401,
"kl": 0.025617226026952267,
"learning_rate": 5.999999982184864e-06,
"loss": 0.0221,
"num_tokens": 2780777.0,
"reward": 0.4387500286102295,
"reward_std": 0.25966876745224,
"rewards/rollout_reward_func/mean": 0.4387500286102295,
"rewards/rollout_reward_func/std": 0.3832606077194214,
"sampling/importance_sampling_ratio/max": 2.3271644115448,
"sampling/importance_sampling_ratio/mean": 1.0649113655090332,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.0678925514221191,
"sampling/sampling_logp_difference/mean": 0.05666026473045349,
"step": 37,
"step_time": 12.593250806000015
},
{
"clip_ratio/high_max": 0.028383397962898016,
"clip_ratio/high_mean": 0.010161041049286723,
"clip_ratio/low_mean": 0.006483843666501343,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.016644884599372745,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2777.0,
"completions/max_terminated_length": 2777.0,
"completions/mean_length": 1819.5625,
"completions/mean_terminated_length": 1819.5625,
"completions/min_length": 1056.0,
"completions/min_terminated_length": 1056.0,
"entropy": 0.38034912198781967,
"epoch": 0.00304,
"frac_reward_zero_std": 0.25,
"grad_norm": 2.0448880195617676,
"kl": 0.04296189732849598,
"learning_rate": 5.999999928739459e-06,
"loss": -0.0115,
"num_tokens": 2851032.0,
"reward": 0.6024999618530273,
"reward_std": 0.2617889940738678,
"rewards/rollout_reward_func/mean": 0.6024999618530273,
"rewards/rollout_reward_func/std": 0.44098126888275146,
"sampling/importance_sampling_ratio/max": 2.681164503097534,
"sampling/importance_sampling_ratio/mean": 1.0418896675109863,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.4294462203979492,
"sampling/sampling_logp_difference/mean": 0.0609976202249527,
"step": 38,
"step_time": 12.55964067500031
},
{
"clip_ratio/high_max": 0.047167123295366764,
"clip_ratio/high_mean": 0.014736625598743558,
"clip_ratio/low_mean": 0.004429678898304701,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01916630449704826,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2820.0,
"completions/max_terminated_length": 2820.0,
"completions/mean_length": 2000.0,
"completions/mean_terminated_length": 2000.0,
"completions/min_length": 1055.0,
"completions/min_terminated_length": 1055.0,
"entropy": 0.4035666435956955,
"epoch": 0.00312,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.904247760772705,
"kl": 0.03608058113604784,
"learning_rate": 5.999999839663784e-06,
"loss": -0.1975,
"num_tokens": 2927712.0,
"reward": 0.3853124976158142,
"reward_std": 0.1657649129629135,
"rewards/rollout_reward_func/mean": 0.3853124976158142,
"rewards/rollout_reward_func/std": 0.31012988090515137,
"sampling/importance_sampling_ratio/max": 2.3516104221343994,
"sampling/importance_sampling_ratio/mean": 0.8599222898483276,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.4187037944793701,
"sampling/sampling_logp_difference/mean": 0.05978023633360863,
"step": 39,
"step_time": 12.440508590000036
},
{
"clip_ratio/high_max": 0.04069459065794945,
"clip_ratio/high_mean": 0.017941734986379743,
"clip_ratio/low_mean": 0.0016447368543595076,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01958647184073925,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2423.0,
"completions/max_terminated_length": 2423.0,
"completions/mean_length": 1889.0625,
"completions/mean_terminated_length": 1889.0625,
"completions/min_length": 1054.0,
"completions/min_terminated_length": 1054.0,
"entropy": 0.42887038737535477,
"epoch": 0.0032,
"frac_reward_zero_std": 0.25,
"grad_norm": 2.507852077484131,
"kl": 0.031137569807469845,
"learning_rate": 5.99999971495784e-06,
"loss": -0.0375,
"num_tokens": 3000212.0,
"reward": 0.38593751192092896,
"reward_std": 0.16842570900917053,
"rewards/rollout_reward_func/mean": 0.38593751192092896,
"rewards/rollout_reward_func/std": 0.35313212871551514,
"sampling/importance_sampling_ratio/max": 1.8619109392166138,
"sampling/importance_sampling_ratio/mean": 0.8876512050628662,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.8854889869689941,
"sampling/sampling_logp_difference/mean": 0.0671561062335968,
"step": 40,
"step_time": 11.693177195999851
},
{
"clip_ratio/high_max": 0.02651259582489729,
"clip_ratio/high_mean": 0.006628148956224322,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00836426008027047,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2789.0,
"completions/max_terminated_length": 2789.0,
"completions/mean_length": 2136.03125,
"completions/mean_terminated_length": 2136.03125,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.42095063626766205,
"epoch": 0.00328,
"frac_reward_zero_std": 0.625,
"grad_norm": 1.2850134372711182,
"kl": 0.039208856876939535,
"learning_rate": 5.99999955462163e-06,
"loss": -0.0237,
"num_tokens": 3081651.0,
"reward": 0.3506249785423279,
"reward_std": 0.1440507173538208,
"rewards/rollout_reward_func/mean": 0.3506249785423279,
"rewards/rollout_reward_func/std": 0.2683153748512268,
"sampling/importance_sampling_ratio/max": 2.8166987895965576,
"sampling/importance_sampling_ratio/mean": 1.0108704566955566,
"sampling/importance_sampling_ratio/min": 0.14420194923877716,
"sampling/sampling_logp_difference/max": 1.127936840057373,
"sampling/sampling_logp_difference/mean": 0.06519916653633118,
"step": 41,
"step_time": 14.135176596000292
},
{
"clip_ratio/high_max": 0.03996024373918772,
"clip_ratio/high_mean": 0.012911256635561585,
"clip_ratio/low_mean": 0.004817708395421505,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01772896503098309,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2767.0,
"completions/max_terminated_length": 2767.0,
"completions/mean_length": 1934.65625,
"completions/mean_terminated_length": 1934.65625,
"completions/min_length": 1055.0,
"completions/min_terminated_length": 1055.0,
"entropy": 0.38335342705249786,
"epoch": 0.00336,
"frac_reward_zero_std": 0.25,
"grad_norm": 2.1722676753997803,
"kl": 0.13585597835481167,
"learning_rate": 5.999999358655157e-06,
"loss": -0.2418,
"num_tokens": 3156023.0,
"reward": 0.3475000262260437,
"reward_std": 0.21655070781707764,
"rewards/rollout_reward_func/mean": 0.3475000262260437,
"rewards/rollout_reward_func/std": 0.3131937086582184,
"sampling/importance_sampling_ratio/max": 2.6130497455596924,
"sampling/importance_sampling_ratio/mean": 0.8806287050247192,
"sampling/importance_sampling_ratio/min": 0.16678351163864136,
"sampling/sampling_logp_difference/max": 2.3499860763549805,
"sampling/sampling_logp_difference/mean": 0.06342820823192596,
"step": 42,
"step_time": 13.112628190999885
},
{
"clip_ratio/high_max": 0.021321472711861134,
"clip_ratio/high_mean": 0.007562511134892702,
"clip_ratio/low_mean": 0.0038768798112869263,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011439391179010272,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2797.0,
"completions/max_terminated_length": 2797.0,
"completions/mean_length": 1674.3125,
"completions/mean_terminated_length": 1674.3125,
"completions/min_length": 1054.0,
"completions/min_terminated_length": 1054.0,
"entropy": 0.3885280713438988,
"epoch": 0.00344,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.4752204418182373,
"kl": 0.036413189955055714,
"learning_rate": 5.999999127058423e-06,
"loss": 0.0258,
"num_tokens": 3221611.0,
"reward": 0.6737500429153442,
"reward_std": 0.25966876745224,
"rewards/rollout_reward_func/mean": 0.6737500429153442,
"rewards/rollout_reward_func/std": 0.4556862711906433,
"sampling/importance_sampling_ratio/max": 2.9477226734161377,
"sampling/importance_sampling_ratio/mean": 1.1396255493164062,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.167872667312622,
"sampling/sampling_logp_difference/mean": 0.06658157706260681,
"step": 43,
"step_time": 12.020586962000152
},
{
"clip_ratio/high_max": 0.036011905409395695,
"clip_ratio/high_mean": 0.010423430823720992,
"clip_ratio/low_mean": 0.0030159883899614215,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013439419795759022,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2814.0,
"completions/max_terminated_length": 2814.0,
"completions/mean_length": 2095.75,
"completions/mean_terminated_length": 2095.75,
"completions/min_length": 1568.0,
"completions/min_terminated_length": 1568.0,
"entropy": 0.39560940861701965,
"epoch": 0.00352,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.8694807291030884,
"kl": 0.1402588039636612,
"learning_rate": 5.999998859831431e-06,
"loss": -0.1597,
"num_tokens": 3301324.0,
"reward": 0.40437501668930054,
"reward_std": 0.2259407639503479,
"rewards/rollout_reward_func/mean": 0.40437501668930054,
"rewards/rollout_reward_func/std": 0.35422733426094055,
"sampling/importance_sampling_ratio/max": 2.6974401473999023,
"sampling/importance_sampling_ratio/mean": 0.8676252365112305,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.911269187927246,
"sampling/sampling_logp_difference/mean": 0.08191373944282532,
"step": 44,
"step_time": 12.868387047999704
},
{
"clip_ratio/high_max": 0.0369886364787817,
"clip_ratio/high_mean": 0.011032873298972845,
"clip_ratio/low_mean": 0.0043535883305594325,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015386461513116956,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2793.0,
"completions/max_terminated_length": 2793.0,
"completions/mean_length": 2412.75,
"completions/mean_terminated_length": 2412.75,
"completions/min_length": 1056.0,
"completions/min_terminated_length": 1056.0,
"entropy": 0.4342958629131317,
"epoch": 0.0036,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.422937035560608,
"kl": 0.11194289568811655,
"learning_rate": 5.999998556974188e-06,
"loss": -0.1586,
"num_tokens": 3392626.0,
"reward": 0.35750001668930054,
"reward_std": 0.0949999988079071,
"rewards/rollout_reward_func/mean": 0.35750001668930054,
"rewards/rollout_reward_func/std": 0.260532945394516,
"sampling/importance_sampling_ratio/max": 2.1776068210601807,
"sampling/importance_sampling_ratio/mean": 0.852668285369873,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 3.2368037700653076,
"sampling/sampling_logp_difference/mean": 0.07167594134807587,
"step": 45,
"step_time": 13.40532306199998
},
{
"clip_ratio/high_max": 0.036038962192833424,
"clip_ratio/high_mean": 0.012058520689606667,
"clip_ratio/low_mean": 0.0017857142956927419,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013844234868884087,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2784.0,
"completions/max_terminated_length": 2784.0,
"completions/mean_length": 2018.40625,
"completions/mean_terminated_length": 2018.40625,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.3769753500819206,
"epoch": 0.00368,
"frac_reward_zero_std": 0.5,
"grad_norm": 6.942874908447266,
"kl": 0.8322499115020037,
"learning_rate": 5.999998218486697e-06,
"loss": -0.0692,
"num_tokens": 3469989.0,
"reward": 0.39250001311302185,
"reward_std": 0.14825798571109772,
"rewards/rollout_reward_func/mean": 0.39250001311302185,
"rewards/rollout_reward_func/std": 0.29918164014816284,
"sampling/importance_sampling_ratio/max": 2.446554660797119,
"sampling/importance_sampling_ratio/mean": 0.8061342239379883,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 3.015519142150879,
"sampling/sampling_logp_difference/mean": 0.07696790993213654,
"step": 46,
"step_time": 12.274703390000013
},
{
"clip_ratio/high_max": 0.04237867519259453,
"clip_ratio/high_mean": 0.01807057624682784,
"clip_ratio/low_mean": 0.005178963067010045,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.023249539081007242,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2800.0,
"completions/max_terminated_length": 2800.0,
"completions/mean_length": 1802.75,
"completions/mean_terminated_length": 1802.75,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.38206612318754196,
"epoch": 0.00376,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.6050693988800049,
"kl": 0.05531273875385523,
"learning_rate": 5.999997844368963e-06,
"loss": -0.0113,
"num_tokens": 3540097.0,
"reward": 0.4990624785423279,
"reward_std": 0.28371256589889526,
"rewards/rollout_reward_func/mean": 0.4990624785423279,
"rewards/rollout_reward_func/std": 0.41065138578414917,
"sampling/importance_sampling_ratio/max": 1.9599696397781372,
"sampling/importance_sampling_ratio/mean": 0.8884379863739014,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.8157303333282471,
"sampling/sampling_logp_difference/mean": 0.06130218505859375,
"step": 47,
"step_time": 12.33328224800016
},
{
"clip_ratio/high_max": 0.0206808946095407,
"clip_ratio/high_mean": 0.005170223652385175,
"clip_ratio/low_mean": 0.004861111170612276,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01003133482299745,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2813.0,
"completions/max_terminated_length": 2813.0,
"completions/mean_length": 1976.21875,
"completions/mean_terminated_length": 1976.21875,
"completions/min_length": 1054.0,
"completions/min_terminated_length": 1054.0,
"entropy": 0.37976498901844025,
"epoch": 0.00384,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.5906230211257935,
"kl": 0.11688470654189587,
"learning_rate": 5.999997434620992e-06,
"loss": -0.1357,
"num_tokens": 3616089.0,
"reward": 0.437812477350235,
"reward_std": 0.20705953240394592,
"rewards/rollout_reward_func/mean": 0.437812477350235,
"rewards/rollout_reward_func/std": 0.35220715403556824,
"sampling/importance_sampling_ratio/max": 1.8663876056671143,
"sampling/importance_sampling_ratio/mean": 0.8626433610916138,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.65832781791687,
"sampling/sampling_logp_difference/mean": 0.06971758604049683,
"step": 48,
"step_time": 12.541744299999891
},
{
"clip_ratio/high_max": 0.012820512987673283,
"clip_ratio/high_mean": 0.0032051282469183207,
"clip_ratio/low_mean": 0.0014534883666783571,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004658616613596678,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2807.0,
"completions/max_terminated_length": 2807.0,
"completions/mean_length": 2245.15625,
"completions/mean_terminated_length": 2245.15625,
"completions/min_length": 1551.0,
"completions/min_terminated_length": 1551.0,
"entropy": 0.4284479096531868,
"epoch": 0.00392,
"frac_reward_zero_std": 0.75,
"grad_norm": 1.0771631002426147,
"kl": 0.046674114651978016,
"learning_rate": 5.999996989242791e-06,
"loss": -0.0014,
"num_tokens": 3701038.0,
"reward": 0.42624998092651367,
"reward_std": 0.13466876745224,
"rewards/rollout_reward_func/mean": 0.42624998092651367,
"rewards/rollout_reward_func/std": 0.3314265012741089,
"sampling/importance_sampling_ratio/max": 1.4823979139328003,
"sampling/importance_sampling_ratio/mean": 0.8060042858123779,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.11665940284729,
"sampling/sampling_logp_difference/mean": 0.0697537213563919,
"step": 49,
"step_time": 13.132170692999807
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2787.0,
"completions/max_terminated_length": 2787.0,
"completions/mean_length": 2460.71875,
"completions/mean_terminated_length": 2460.71875,
"completions/min_length": 2034.0,
"completions/min_terminated_length": 2034.0,
"entropy": 0.4269709587097168,
"epoch": 0.004,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.06742172688245773,
"kl": 0.05730041675269604,
"learning_rate": 5.999996508234369e-06,
"loss": 0.0008,
"num_tokens": 3793655.0,
"reward": 0.30000001192092896,
"reward_std": 0.0,
"rewards/rollout_reward_func/mean": 0.30000001192092896,
"rewards/rollout_reward_func/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.4013469219207764,
"sampling/importance_sampling_ratio/mean": 0.8128387928009033,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.065826416015625,
"sampling/sampling_logp_difference/mean": 0.07448764890432358,
"step": 50,
"step_time": 13.017520340999681
},
{
"clip_ratio/high_max": 0.03630952490493655,
"clip_ratio/high_mean": 0.012549603707157075,
"clip_ratio/low_mean": 0.0031565657118335366,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015706169069744647,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2785.0,
"completions/max_terminated_length": 2785.0,
"completions/mean_length": 1773.78125,
"completions/mean_terminated_length": 1773.78125,
"completions/min_length": 1054.0,
"completions/min_terminated_length": 1054.0,
"entropy": 0.37685880810022354,
"epoch": 0.00408,
"frac_reward_zero_std": 0.625,
"grad_norm": 1.299412727355957,
"kl": 0.04002719838172197,
"learning_rate": 5.999995991595729e-06,
"loss": -0.0109,
"num_tokens": 3862448.0,
"reward": 0.5353125333786011,
"reward_std": 0.08654377609491348,
"rewards/rollout_reward_func/mean": 0.5353125333786011,
"rewards/rollout_reward_func/std": 0.41608762741088867,
"sampling/importance_sampling_ratio/max": 2.3817224502563477,
"sampling/importance_sampling_ratio/mean": 0.9811595678329468,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.0328466892242432,
"sampling/sampling_logp_difference/mean": 0.06913870573043823,
"step": 51,
"step_time": 12.60499654799969
},
{
"clip_ratio/high_max": 0.03819444449618459,
"clip_ratio/high_mean": 0.015144050237722695,
"clip_ratio/low_mean": 0.00554396363440901,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02068801363930106,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2784.0,
"completions/max_terminated_length": 2784.0,
"completions/mean_length": 1786.09375,
"completions/mean_terminated_length": 1786.09375,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.38082515448331833,
"epoch": 0.00416,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.5871905088424683,
"kl": 0.06744291074573994,
"learning_rate": 5.999995439326883e-06,
"loss": -0.0699,
"num_tokens": 3931876.0,
"reward": 0.6090624928474426,
"reward_std": 0.26599711179733276,
"rewards/rollout_reward_func/mean": 0.6090624928474426,
"rewards/rollout_reward_func/std": 0.4591953456401825,
"sampling/importance_sampling_ratio/max": 2.734297752380371,
"sampling/importance_sampling_ratio/mean": 0.9665597677230835,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.065897226333618,
"sampling/sampling_logp_difference/mean": 0.06354629993438721,
"step": 52,
"step_time": 13.568785395000077
},
{
"clip_ratio/high_max": 0.022086466662585735,
"clip_ratio/high_mean": 0.008820227812975645,
"clip_ratio/low_mean": 0.0057043652050197124,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01452459313441068,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2789.0,
"completions/max_terminated_length": 2789.0,
"completions/mean_length": 1635.03125,
"completions/mean_terminated_length": 1635.03125,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.36858493834733963,
"epoch": 0.00424,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.9566222429275513,
"kl": 0.07718627620488405,
"learning_rate": 5.999994851427837e-06,
"loss": 0.0822,
"num_tokens": 3995868.0,
"reward": 0.6918749809265137,
"reward_std": 0.3203721046447754,
"rewards/rollout_reward_func/mean": 0.6918749809265137,
"rewards/rollout_reward_func/std": 0.4697249233722687,
"sampling/importance_sampling_ratio/max": 2.7838289737701416,
"sampling/importance_sampling_ratio/mean": 0.9136906266212463,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.7872750759124756,
"sampling/sampling_logp_difference/mean": 0.07199069857597351,
"step": 53,
"step_time": 12.247058065999909
},
{
"clip_ratio/high_max": 0.041652148589491844,
"clip_ratio/high_mean": 0.013425522716715932,
"clip_ratio/low_mean": 0.01002952002454549,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0234550426248461,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2784.0,
"completions/max_terminated_length": 2784.0,
"completions/mean_length": 1590.09375,
"completions/mean_terminated_length": 1590.09375,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.37869949638843536,
"epoch": 0.00432,
"frac_reward_zero_std": 0.125,
"grad_norm": 2.3980886936187744,
"kl": 0.05773049034178257,
"learning_rate": 5.999994227898604e-06,
"loss": -0.0192,
"num_tokens": 4058303.0,
"reward": 0.4609374701976776,
"reward_std": 0.35279375314712524,
"rewards/rollout_reward_func/mean": 0.4609374701976776,
"rewards/rollout_reward_func/std": 0.44058871269226074,
"sampling/importance_sampling_ratio/max": 2.2311129570007324,
"sampling/importance_sampling_ratio/mean": 0.9393452405929565,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.9994411468505859,
"sampling/sampling_logp_difference/mean": 0.08165294677019119,
"step": 54,
"step_time": 11.528886767999893
},
{
"clip_ratio/high_max": 0.02447916753590107,
"clip_ratio/high_mean": 0.0075732802506536245,
"clip_ratio/low_mean": 0.00947712454944849,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01705040503293276,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2444.0,
"completions/max_terminated_length": 2444.0,
"completions/mean_length": 1789.75,
"completions/mean_terminated_length": 1789.75,
"completions/min_length": 1055.0,
"completions/min_terminated_length": 1055.0,
"entropy": 0.36519913375377655,
"epoch": 0.0044,
"frac_reward_zero_std": 0.375,
"grad_norm": 2.2972187995910645,
"kl": 0.05506392475217581,
"learning_rate": 5.99999356873919e-06,
"loss": -0.1185,
"num_tokens": 4127411.0,
"reward": 0.40562498569488525,
"reward_std": 0.22391541302204132,
"rewards/rollout_reward_func/mean": 0.40562498569488525,
"rewards/rollout_reward_func/std": 0.3422500193119049,
"sampling/importance_sampling_ratio/max": 2.4115021228790283,
"sampling/importance_sampling_ratio/mean": 0.9461013674736023,
"sampling/importance_sampling_ratio/min": 0.14957794547080994,
"sampling/sampling_logp_difference/max": 1.0122857093811035,
"sampling/sampling_logp_difference/mean": 0.06259442120790482,
"step": 55,
"step_time": 11.493340414999693
},
{
"clip_ratio/high_max": 0.0438775522634387,
"clip_ratio/high_mean": 0.012457483448088169,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012457483448088169,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2799.0,
"completions/max_terminated_length": 2799.0,
"completions/mean_length": 2306.375,
"completions/mean_terminated_length": 2306.375,
"completions/min_length": 1567.0,
"completions/min_terminated_length": 1567.0,
"entropy": 0.40949854254722595,
"epoch": 0.00448,
"frac_reward_zero_std": 0.375,
"grad_norm": 2.023374319076538,
"kl": 0.08688413165509701,
"learning_rate": 5.999992873949609e-06,
"loss": -0.0712,
"num_tokens": 4214487.0,
"reward": 0.296875,
"reward_std": 0.08874999731779099,
"rewards/rollout_reward_func/mean": 0.296875,
"rewards/rollout_reward_func/std": 0.15228237211704254,
"sampling/importance_sampling_ratio/max": 2.9896316528320312,
"sampling/importance_sampling_ratio/mean": 0.968756377696991,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.835113763809204,
"sampling/sampling_logp_difference/mean": 0.08090537041425705,
"step": 56,
"step_time": 13.222404719000224
},
{
"clip_ratio/high_max": 0.04506416339427233,
"clip_ratio/high_mean": 0.01424223161302507,
"clip_ratio/low_mean": 0.002842377289198339,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01708460901863873,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2441.0,
"completions/max_terminated_length": 2441.0,
"completions/mean_length": 1963.0,
"completions/mean_terminated_length": 1963.0,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.4080217182636261,
"epoch": 0.00456,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.663516640663147,
"kl": 0.3106076046824455,
"learning_rate": 5.999992143529868e-06,
"loss": -0.0796,
"num_tokens": 4289619.0,
"reward": 0.3934375047683716,
"reward_std": 0.1563829779624939,
"rewards/rollout_reward_func/mean": 0.3934375047683716,
"rewards/rollout_reward_func/std": 0.30592650175094604,
"sampling/importance_sampling_ratio/max": 1.4833005666732788,
"sampling/importance_sampling_ratio/mean": 0.5795140862464905,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 3.2869999408721924,
"sampling/sampling_logp_difference/mean": 0.0979442298412323,
"step": 57,
"step_time": 11.762371722000125
},
{
"clip_ratio/high_max": 0.046875,
"clip_ratio/high_mean": 0.01171875,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2802.0,
"completions/max_terminated_length": 2802.0,
"completions/mean_length": 1879.75,
"completions/mean_terminated_length": 1879.75,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.3892976716160774,
"epoch": 0.00464,
"frac_reward_zero_std": 0.625,
"grad_norm": 1.1009422540664673,
"kl": 0.05097049381583929,
"learning_rate": 5.999991377479982e-06,
"loss": -0.0191,
"num_tokens": 4362090.0,
"reward": 0.5262500047683716,
"reward_std": 0.1875,
"rewards/rollout_reward_func/mean": 0.5262500047683716,
"rewards/rollout_reward_func/std": 0.4009806215763092,
"sampling/importance_sampling_ratio/max": 2.9964590072631836,
"sampling/importance_sampling_ratio/mean": 1.0218505859375,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.0378296375274658,
"sampling/sampling_logp_difference/mean": 0.06926104426383972,
"step": 58,
"step_time": 13.15352440300012
},
{
"clip_ratio/high_max": 0.019571688026189804,
"clip_ratio/high_mean": 0.004892922006547451,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004892922006547451,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2832.0,
"completions/max_terminated_length": 2832.0,
"completions/mean_length": 2202.375,
"completions/mean_terminated_length": 2202.375,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.41450754553079605,
"epoch": 0.00472,
"frac_reward_zero_std": 0.625,
"grad_norm": 2.1054370403289795,
"kl": 0.03926007356494665,
"learning_rate": 5.999990575799961e-06,
"loss": 0.0595,
"num_tokens": 4446012.0,
"reward": 0.44343751668930054,
"reward_std": 0.13312500715255737,
"rewards/rollout_reward_func/mean": 0.44343751668930054,
"rewards/rollout_reward_func/std": 0.3428213894367218,
"sampling/importance_sampling_ratio/max": 2.236393928527832,
"sampling/importance_sampling_ratio/mean": 0.8590089678764343,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.9236248731613159,
"sampling/sampling_logp_difference/mean": 0.06904841959476471,
"step": 59,
"step_time": 13.58062329900008
},
{
"clip_ratio/high_max": 0.029240576550364494,
"clip_ratio/high_mean": 0.007310144137591124,
"clip_ratio/low_mean": 0.0030868902103975415,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010397034231573343,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2429.0,
"completions/max_terminated_length": 2429.0,
"completions/mean_length": 2124.9375,
"completions/mean_terminated_length": 2124.9375,
"completions/min_length": 1567.0,
"completions/min_terminated_length": 1567.0,
"entropy": 0.370839923620224,
"epoch": 0.0048,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.5435988903045654,
"kl": 0.08518982026726007,
"learning_rate": 5.99998973848982e-06,
"loss": -0.0579,
"num_tokens": 4527051.0,
"reward": 0.3590624928474426,
"reward_std": 0.06796419620513916,
"rewards/rollout_reward_func/mean": 0.3590624928474426,
"rewards/rollout_reward_func/std": 0.22809672355651855,
"sampling/importance_sampling_ratio/max": 2.2381787300109863,
"sampling/importance_sampling_ratio/mean": 0.8449472188949585,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.6413207054138184,
"sampling/sampling_logp_difference/mean": 0.069917693734169,
"step": 60,
"step_time": 11.73221161399988
},
{
"clip_ratio/high_max": 0.02281746082007885,
"clip_ratio/high_mean": 0.006954365293495357,
"clip_ratio/low_mean": 0.0037499999161809683,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010704364976845682,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2767.0,
"completions/max_terminated_length": 2767.0,
"completions/mean_length": 1689.3125,
"completions/mean_terminated_length": 1689.3125,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.3677019253373146,
"epoch": 0.00488,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.9829214811325073,
"kl": 0.058779667131602764,
"learning_rate": 5.999988865549569e-06,
"loss": 0.0304,
"num_tokens": 4593095.0,
"reward": 0.6549999713897705,
"reward_std": 0.22813192009925842,
"rewards/rollout_reward_func/mean": 0.6549999713897705,
"rewards/rollout_reward_func/std": 0.45271220803260803,
"sampling/importance_sampling_ratio/max": 1.883159875869751,
"sampling/importance_sampling_ratio/mean": 0.8463116884231567,
"sampling/importance_sampling_ratio/min": 0.22824469208717346,
"sampling/sampling_logp_difference/max": 1.781625747680664,
"sampling/sampling_logp_difference/mean": 0.06828776746988297,
"step": 61,
"step_time": 12.480462479000153
},
{
"clip_ratio/high_max": 0.0110975606366992,
"clip_ratio/high_mean": 0.0027743901591748,
"clip_ratio/low_mean": 0.0014880952658131719,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004262485424987972,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2815.0,
"completions/max_terminated_length": 2815.0,
"completions/mean_length": 2305.15625,
"completions/mean_terminated_length": 2305.15625,
"completions/min_length": 1571.0,
"completions/min_terminated_length": 1571.0,
"entropy": 0.4012472406029701,
"epoch": 0.00496,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.8583229780197144,
"kl": 0.06238031107932329,
"learning_rate": 5.999987956979225e-06,
"loss": -0.0392,
"num_tokens": 4680377.0,
"reward": 0.3434374928474426,
"reward_std": 0.08029377460479736,
"rewards/rollout_reward_func/mean": 0.3434374928474426,
"rewards/rollout_reward_func/std": 0.22245851159095764,
"sampling/importance_sampling_ratio/max": 2.7665059566497803,
"sampling/importance_sampling_ratio/mean": 0.9882571697235107,
"sampling/importance_sampling_ratio/min": 0.059778764843940735,
"sampling/sampling_logp_difference/max": 0.9543299674987793,
"sampling/sampling_logp_difference/mean": 0.0674777626991272,
"step": 62,
"step_time": 14.56621960199982
},
{
"clip_ratio/high_max": 0.05253623379394412,
"clip_ratio/high_mean": 0.016606280929408967,
"clip_ratio/low_mean": 0.0018382353009656072,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.018444516230374575,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2423.0,
"completions/max_terminated_length": 2423.0,
"completions/mean_length": 1968.9375,
"completions/mean_terminated_length": 1968.9375,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.4196172505617142,
"epoch": 0.00504,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.905213475227356,
"kl": 0.09828684013336897,
"learning_rate": 5.999987012778799e-06,
"loss": -0.0034,
"num_tokens": 4755993.0,
"reward": 0.33031249046325684,
"reward_std": 0.09654378145933151,
"rewards/rollout_reward_func/mean": 0.33031249046325684,
"rewards/rollout_reward_func/std": 0.21877197921276093,
"sampling/importance_sampling_ratio/max": 2.0344085693359375,
"sampling/importance_sampling_ratio/mean": 0.7417819499969482,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.208054542541504,
"sampling/sampling_logp_difference/mean": 0.07868118584156036,
"step": 63,
"step_time": 11.736785162999922
},
{
"clip_ratio/high_max": 0.0055555556900799274,
"clip_ratio/high_mean": 0.0027777778450399637,
"clip_ratio/low_mean": 0.003794643096625805,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006572420941665769,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2814.0,
"completions/max_terminated_length": 2814.0,
"completions/mean_length": 1972.9375,
"completions/mean_terminated_length": 1972.9375,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.4023704081773758,
"epoch": 0.00512,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.3090136051177979,
"kl": 0.12237261980772018,
"learning_rate": 5.9999860329483104e-06,
"loss": -0.194,
"num_tokens": 4831827.0,
"reward": 0.5878125429153442,
"reward_std": 0.142506942152977,
"rewards/rollout_reward_func/mean": 0.5878125429153442,
"rewards/rollout_reward_func/std": 0.4488244950771332,
"sampling/importance_sampling_ratio/max": 2.9826838970184326,
"sampling/importance_sampling_ratio/mean": 0.9495848417282104,
"sampling/importance_sampling_ratio/min": 0.01580546610057354,
"sampling/sampling_logp_difference/max": 2.0490379333496094,
"sampling/sampling_logp_difference/mean": 0.07465916872024536,
"step": 64,
"step_time": 12.846850890000042
},
{
"clip_ratio/high_max": 0.021152781788259745,
"clip_ratio/high_mean": 0.006812585634179413,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006812585634179413,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2800.0,
"completions/max_terminated_length": 2800.0,
"completions/mean_length": 2005.0,
"completions/mean_terminated_length": 2005.0,
"completions/min_length": 1054.0,
"completions/min_terminated_length": 1054.0,
"entropy": 0.40342626720666885,
"epoch": 0.0052,
"frac_reward_zero_std": 0.625,
"grad_norm": 1.2556148767471313,
"kl": 0.0746797863394022,
"learning_rate": 5.999985017487771e-06,
"loss": -0.0305,
"num_tokens": 4908716.0,
"reward": 0.3818749785423279,
"reward_std": 0.15371949970722198,
"rewards/rollout_reward_func/mean": 0.3818749785423279,
"rewards/rollout_reward_func/std": 0.3037022650241852,
"sampling/importance_sampling_ratio/max": 2.2475836277008057,
"sampling/importance_sampling_ratio/mean": 0.8594139814376831,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.4058151245117188,
"sampling/sampling_logp_difference/mean": 0.06716296076774597,
"step": 65,
"step_time": 12.423915739000222
},
{
"clip_ratio/high_max": 0.03018707549199462,
"clip_ratio/high_mean": 0.007546768872998655,
"clip_ratio/low_mean": 0.006225198740139604,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013771967613138258,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2792.0,
"completions/max_terminated_length": 2792.0,
"completions/mean_length": 2048.40625,
"completions/mean_terminated_length": 2048.40625,
"completions/min_length": 1055.0,
"completions/min_terminated_length": 1055.0,
"entropy": 0.38859760761260986,
"epoch": 0.00528,
"frac_reward_zero_std": 0.375,
"grad_norm": 2.1451351642608643,
"kl": 0.22367357090115547,
"learning_rate": 5.999983966397197e-06,
"loss": -0.1677,
"num_tokens": 4987207.0,
"reward": 0.4712499976158142,
"reward_std": 0.18434235453605652,
"rewards/rollout_reward_func/mean": 0.4712499976158142,
"rewards/rollout_reward_func/std": 0.35322248935699463,
"sampling/importance_sampling_ratio/max": 2.9422554969787598,
"sampling/importance_sampling_ratio/mean": 0.9915428161621094,
"sampling/importance_sampling_ratio/min": 0.016011416912078857,
"sampling/sampling_logp_difference/max": 2.497363805770874,
"sampling/sampling_logp_difference/mean": 0.07474374771118164,
"step": 66,
"step_time": 12.926716641999974
},
{
"clip_ratio/high_max": 0.01376319769769907,
"clip_ratio/high_mean": 0.005043363547883928,
"clip_ratio/low_mean": 0.0014880952658131719,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0065314588136971,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2783.0,
"completions/max_terminated_length": 2783.0,
"completions/mean_length": 1772.40625,
"completions/mean_terminated_length": 1772.40625,
"completions/min_length": 1055.0,
"completions/min_terminated_length": 1055.0,
"entropy": 0.4034885838627815,
"epoch": 0.00536,
"frac_reward_zero_std": 0.625,
"grad_norm": 1.5488033294677734,
"kl": 0.05104802828282118,
"learning_rate": 5.999982879676608e-06,
"loss": -0.041,
"num_tokens": 5055760.0,
"reward": 0.5737500190734863,
"reward_std": 0.16325795650482178,
"rewards/rollout_reward_func/mean": 0.5737500190734863,
"rewards/rollout_reward_func/std": 0.40885162353515625,
"sampling/importance_sampling_ratio/max": 2.278799057006836,
"sampling/importance_sampling_ratio/mean": 0.99635910987854,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.8835396766662598,
"sampling/sampling_logp_difference/mean": 0.06713330745697021,
"step": 67,
"step_time": 12.46692701400002
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.004360465100035071,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004360465100035071,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2796.0,
"completions/max_terminated_length": 2796.0,
"completions/mean_length": 2270.40625,
"completions/mean_terminated_length": 2270.40625,
"completions/min_length": 1983.0,
"completions/min_terminated_length": 1983.0,
"entropy": 0.4274456053972244,
"epoch": 0.00544,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.8523308038711548,
"kl": 0.13869191519916058,
"learning_rate": 5.9999817573260195e-06,
"loss": -0.1124,
"num_tokens": 5141713.0,
"reward": 0.2878125011920929,
"reward_std": 0.01750694215297699,
"rewards/rollout_reward_func/mean": 0.2878125011920929,
"rewards/rollout_reward_func/std": 0.03849880024790764,
"sampling/importance_sampling_ratio/max": 2.765258550643921,
"sampling/importance_sampling_ratio/mean": 0.8237208127975464,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.7204997539520264,
"sampling/sampling_logp_difference/mean": 0.08386299759149551,
"step": 68,
"step_time": 12.989918530000296
},
{
"clip_ratio/high_max": 0.04849738674238324,
"clip_ratio/high_mean": 0.016489425906911492,
"clip_ratio/low_mean": 0.0013888889225199819,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.017878314713016152,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2441.0,
"completions/max_terminated_length": 2441.0,
"completions/mean_length": 1876.46875,
"completions/mean_terminated_length": 1876.46875,
"completions/min_length": 1054.0,
"completions/min_terminated_length": 1054.0,
"entropy": 0.39091238379478455,
"epoch": 0.00552,
"frac_reward_zero_std": 0.625,
"grad_norm": 1.4318950176239014,
"kl": 0.09389345720410347,
"learning_rate": 5.999980599345448e-06,
"loss": -0.0356,
"num_tokens": 5214177.0,
"reward": 0.5806249976158142,
"reward_std": 0.07874999940395355,
"rewards/rollout_reward_func/mean": 0.5806249976158142,
"rewards/rollout_reward_func/std": 0.4241190552711487,
"sampling/importance_sampling_ratio/max": 1.9308501482009888,
"sampling/importance_sampling_ratio/mean": 0.9257422089576721,
"sampling/importance_sampling_ratio/min": 0.21397040784358978,
"sampling/sampling_logp_difference/max": 1.7655794620513916,
"sampling/sampling_logp_difference/mean": 0.06729073822498322,
"step": 69,
"step_time": 11.706464537000102
},
{
"clip_ratio/high_max": 0.012202381156384945,
"clip_ratio/high_mean": 0.0030505952890962362,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0030505952890962362,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2429.0,
"completions/max_terminated_length": 2429.0,
"completions/mean_length": 1901.125,
"completions/mean_terminated_length": 1901.125,
"completions/min_length": 1056.0,
"completions/min_terminated_length": 1056.0,
"entropy": 0.4174434766173363,
"epoch": 0.0056,
"frac_reward_zero_std": 0.625,
"grad_norm": 1.2482589483261108,
"kl": 0.0709009887650609,
"learning_rate": 5.999979405734914e-06,
"loss": -0.0875,
"num_tokens": 5287259.0,
"reward": 0.44999998807907104,
"reward_std": 0.19828803837299347,
"rewards/rollout_reward_func/mean": 0.44999998807907104,
"rewards/rollout_reward_func/std": 0.36232221126556396,
"sampling/importance_sampling_ratio/max": 2.240993022918701,
"sampling/importance_sampling_ratio/mean": 0.7815386652946472,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.9302306175231934,
"sampling/sampling_logp_difference/mean": 0.07109043747186661,
"step": 70,
"step_time": 11.92579563199979
},
{
"clip_ratio/high_max": 0.03573596617206931,
"clip_ratio/high_mean": 0.011878836317919195,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01361494732555002,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2800.0,
"completions/max_terminated_length": 2800.0,
"completions/mean_length": 2164.34375,
"completions/mean_terminated_length": 2164.34375,
"completions/min_length": 1055.0,
"completions/min_terminated_length": 1055.0,
"entropy": 0.4176176115870476,
"epoch": 0.00568,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.604291319847107,
"kl": 0.0694936579093337,
"learning_rate": 5.999978176494435e-06,
"loss": -0.1233,
"num_tokens": 5369772.0,
"reward": 0.4762499928474426,
"reward_std": 0.21075797080993652,
"rewards/rollout_reward_func/mean": 0.4762499928474426,
"rewards/rollout_reward_func/std": 0.3796156644821167,
"sampling/importance_sampling_ratio/max": 2.497992992401123,
"sampling/importance_sampling_ratio/mean": 0.8422503471374512,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.5480012893676758,
"sampling/sampling_logp_difference/mean": 0.0728713721036911,
"step": 71,
"step_time": 12.894382659999792
},
{
"clip_ratio/high_max": 0.02281746082007885,
"clip_ratio/high_mean": 0.008831319864839315,
"clip_ratio/low_mean": 0.003260501311160624,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012091821059584618,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2443.0,
"completions/max_terminated_length": 2443.0,
"completions/mean_length": 1719.03125,
"completions/mean_terminated_length": 1719.03125,
"completions/min_length": 1054.0,
"completions/min_terminated_length": 1054.0,
"entropy": 0.39671653509140015,
"epoch": 0.00576,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.5779305696487427,
"kl": 0.08434087503701448,
"learning_rate": 5.99997691162403e-06,
"loss": -0.0636,
"num_tokens": 5436596.0,
"reward": 0.6024999618530273,
"reward_std": 0.2729267477989197,
"rewards/rollout_reward_func/mean": 0.6024999618530273,
"rewards/rollout_reward_func/std": 0.4505694806575775,
"sampling/importance_sampling_ratio/max": 2.754258632659912,
"sampling/importance_sampling_ratio/mean": 0.9120872020721436,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.2350893020629883,
"sampling/sampling_logp_difference/mean": 0.07831829786300659,
"step": 72,
"step_time": 12.011820702999785
},
{
"clip_ratio/high_max": 0.013888888992369175,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005208333372138441,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2780.0,
"completions/max_terminated_length": 2780.0,
"completions/mean_length": 1720.125,
"completions/mean_terminated_length": 1720.125,
"completions/min_length": 1054.0,
"completions/min_terminated_length": 1054.0,
"entropy": 0.35359790176153183,
"epoch": 0.00584,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.760622262954712,
"kl": 0.0876467265188694,
"learning_rate": 5.99997561112372e-06,
"loss": -0.0248,
"num_tokens": 5503333.0,
"reward": 0.7487499713897705,
"reward_std": 0.2987908720970154,
"rewards/rollout_reward_func/mean": 0.7487499713897705,
"rewards/rollout_reward_func/std": 0.46135663986206055,
"sampling/importance_sampling_ratio/max": 2.7877893447875977,
"sampling/importance_sampling_ratio/mean": 0.8998199701309204,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.3734312057495117,
"sampling/sampling_logp_difference/mean": 0.07869358360767365,
"step": 73,
"step_time": 13.119324159999906
},
{
"clip_ratio/high_max": 0.013701201416552067,
"clip_ratio/high_mean": 0.005070037324912846,
"clip_ratio/low_mean": 0.0030487803742289543,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008118817582726479,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2825.0,
"completions/max_terminated_length": 2825.0,
"completions/mean_length": 1916.5,
"completions/mean_terminated_length": 1916.5,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"entropy": 0.38354693353176117,
"epoch": 0.00592,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.8614935874938965,
"kl": 0.05992862023413181,
"learning_rate": 5.999974274993527e-06,
"loss": 0.012,
"num_tokens": 5576952.0,
"reward": 0.5674999952316284,
"reward_std": 0.20719751715660095,
"rewards/rollout_reward_func/mean": 0.5674999952316284,
"rewards/rollout_reward_func/std": 0.4317331612110138,
"sampling/importance_sampling_ratio/max": 1.6715911626815796,
"sampling/importance_sampling_ratio/mean": 0.8377959132194519,
"sampling/importance_sampling_ratio/min": 0.196980819106102,
"sampling/sampling_logp_difference/max": 0.913780689239502,
"sampling/sampling_logp_difference/mean": 0.06950188428163528,
"step": 74,
"step_time": 12.793109332000085
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2785.0,
"completions/max_terminated_length": 2785.0,
"completions/mean_length": 1587.53125,
"completions/mean_terminated_length": 1587.53125,
"completions/min_length": 1054.0,
"completions/min_terminated_length": 1054.0,
"entropy": 0.35842984169721603,
"epoch": 0.006,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.3697266578674316,
"kl": 0.05413582641631365,
"learning_rate": 5.99997290323347e-06,
"loss": -0.0661,
"num_tokens": 5639305.0,
"reward": 0.7106249928474426,
"reward_std": 0.30803900957107544,
"rewards/rollout_reward_func/mean": 0.7106249928474426,
"rewards/rollout_reward_func/std": 0.4498884081840515,
"sampling/importance_sampling_ratio/max": 1.9579815864562988,
"sampling/importance_sampling_ratio/mean": 0.8830677270889282,
"sampling/importance_sampling_ratio/min": 0.15943297743797302,
"sampling/sampling_logp_difference/max": 1.0672590732574463,
"sampling/sampling_logp_difference/mean": 0.06444922834634781,
"step": 75,
"step_time": 12.032428736000156
},
{
"epoch": 0.006,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 2488.6,
"eval_completions/max_terminated_length": 2488.6,
"eval_completions/mean_length": 1951.175,
"eval_completions/mean_terminated_length": 1951.175,
"eval_completions/min_length": 1361.4,
"eval_completions/min_terminated_length": 1361.4,
"eval_entropy": 0.37236364781856535,
"eval_frac_reward_zero_std": 0.1,
"eval_kl": 0.07629953697323799,
"eval_loss": -0.0013724860036745667,
"eval_num_tokens": 5639305.0,
"eval_reward": 0.47524999976158144,
"eval_reward_std": 0.37039353847503664,
"eval_rewards/rollout_reward_func/mean": 0.47524999976158144,
"eval_rewards/rollout_reward_func/std": 0.37039353176951406,
"eval_runtime": 10.5117,
"eval_samples_per_second": 0.951,
"eval_sampling/importance_sampling_ratio/max": 1.6587244033813477,
"eval_sampling/importance_sampling_ratio/mean": 0.8837172389030457,
"eval_sampling/importance_sampling_ratio/min": 0.32487900257110597,
"eval_sampling/sampling_logp_difference/max": 0.8625046908855438,
"eval_sampling/sampling_logp_difference/mean": 0.06909476891160012,
"eval_steps_per_second": 0.285,
"step": 75
}
],
"logging_steps": 1.0,
"max_steps": 25000,
"num_input_tokens_seen": 5639305,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}