{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.006, "eval_steps": 500, "global_step": 75, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2783.0, "completions/max_terminated_length": 2783.0, "completions/mean_length": 2052.75, "completions/mean_terminated_length": 2052.75, "completions/min_length": 1055.0, "completions/min_terminated_length": 1055.0, "entropy": 0.4163087382912636, "epoch": 8e-05, "frac_reward_zero_std": 0.375, "grad_norm": 1.8663769960403442, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0386, "num_tokens": 78630.0, "reward": 0.46406251192092896, "reward_std": 0.20054946839809418, "rewards/rollout_reward_func/mean": 0.46406251192092896, "rewards/rollout_reward_func/std": 0.37604784965515137, "sampling/importance_sampling_ratio/max": 2.1498024463653564, "sampling/importance_sampling_ratio/mean": 1.0975958108901978, "sampling/importance_sampling_ratio/min": 0.241215318441391, "sampling/sampling_logp_difference/max": 0.7405228614807129, "sampling/sampling_logp_difference/mean": 0.039819031953811646, "step": 1, "step_time": 14.418279634999976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2798.0, "completions/max_terminated_length": 2798.0, "completions/mean_length": 2084.21875, "completions/mean_terminated_length": 2084.21875, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.3995310440659523, "epoch": 0.00016, "frac_reward_zero_std": 0.25, "grad_norm": 2.142817735671997, "kl": 0.0, "learning_rate": 1.7142857142857143e-07, "loss": 0.016, "num_tokens": 158194.0, "reward": 0.3384375274181366, "reward_std": 0.16842570900917053, "rewards/rollout_reward_func/mean": 0.3384375274181366, "rewards/rollout_reward_func/std": 0.27340278029441833, "sampling/importance_sampling_ratio/max": 1.9602876901626587, "sampling/importance_sampling_ratio/mean": 0.992855966091156, "sampling/importance_sampling_ratio/min": 0.46628525853157043, "sampling/sampling_logp_difference/max": 0.6929764747619629, "sampling/sampling_logp_difference/mean": 0.04201715067028999, "step": 2, "step_time": 13.260429134000105 }, { "clip_ratio/high_max": 0.04315628902986646, "clip_ratio/high_mean": 0.012242560740560293, "clip_ratio/low_mean": 0.011964043835178018, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.024206604342907667, "completions/clipped_ratio": 0.0, "completions/max_length": 2776.0, "completions/max_terminated_length": 2776.0, "completions/mean_length": 1875.09375, "completions/mean_terminated_length": 1875.09375, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.38934508711099625, "epoch": 0.00024, "frac_reward_zero_std": 0.25, "grad_norm": 2.2311601638793945, "kl": 0.003617420152295381, "learning_rate": 3.4285714285714286e-07, "loss": -0.0954, "num_tokens": 230320.0, "reward": 0.4612500071525574, "reward_std": 0.22380851209163666, "rewards/rollout_reward_func/mean": 0.4612500071525574, "rewards/rollout_reward_func/std": 0.3984546363353729, "sampling/importance_sampling_ratio/max": 1.6067352294921875, "sampling/importance_sampling_ratio/mean": 0.9242645502090454, "sampling/importance_sampling_ratio/min": 0.17279618978500366, "sampling/sampling_logp_difference/max": 1.4119317531585693, "sampling/sampling_logp_difference/mean": 0.045969706028699875, "step": 3, "step_time": 12.304947445999915 }, { "clip_ratio/high_max": 0.023281023371964693, "clip_ratio/high_mean": 0.012716594734229147, "clip_ratio/low_mean": 0.01039634458720684, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023112939670681953, "completions/clipped_ratio": 0.0, "completions/max_length": 2783.0, "completions/max_terminated_length": 2783.0, "completions/mean_length": 2251.84375, "completions/mean_terminated_length": 2251.84375, "completions/min_length": 1562.0, "completions/min_terminated_length": 1562.0, "entropy": 0.4188930094242096, "epoch": 0.00032, "frac_reward_zero_std": 0.25, "grad_norm": 2.3432626724243164, "kl": 0.005323103512637317, "learning_rate": 5.142857142857143e-07, "loss": -0.1037, "num_tokens": 315875.0, "reward": 0.2640625238418579, "reward_std": 0.07438889145851135, "rewards/rollout_reward_func/mean": 0.2640625238418579, "rewards/rollout_reward_func/std": 0.09810657054185867, "sampling/importance_sampling_ratio/max": 2.92923903465271, "sampling/importance_sampling_ratio/mean": 1.0071074962615967, "sampling/importance_sampling_ratio/min": 0.30356213450431824, "sampling/sampling_logp_difference/max": 0.9253432750701904, "sampling/sampling_logp_difference/mean": 0.04933081567287445, "step": 4, "step_time": 13.287707804000092 }, { "clip_ratio/high_max": 0.04239537985995412, "clip_ratio/high_mean": 0.018673060229048133, "clip_ratio/low_mean": 0.0042297979816794395, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022902858443558216, "completions/clipped_ratio": 0.0, "completions/max_length": 2790.0, "completions/max_terminated_length": 2790.0, "completions/mean_length": 2197.3125, "completions/mean_terminated_length": 2197.3125, "completions/min_length": 1559.0, "completions/min_terminated_length": 1559.0, "entropy": 0.4414307102560997, "epoch": 0.0004, "frac_reward_zero_std": 0.25, "grad_norm": 2.470518112182617, "kl": 0.004553150560241193, "learning_rate": 6.857142857142857e-07, "loss": 0.1372, "num_tokens": 399370.0, "reward": 0.40281248092651367, "reward_std": 0.16662904620170593, "rewards/rollout_reward_func/mean": 0.40281248092651367, "rewards/rollout_reward_func/std": 0.3357921242713928, "sampling/importance_sampling_ratio/max": 2.2576870918273926, "sampling/importance_sampling_ratio/mean": 1.0002690553665161, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.462371826171875, "sampling/sampling_logp_difference/mean": 0.053694289177656174, "step": 5, "step_time": 13.068715858000132 }, { "clip_ratio/high_max": 0.02923969691619277, "clip_ratio/high_mean": 0.01021690119523555, "clip_ratio/low_mean": 0.01101089478470385, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021227796096354723, "completions/clipped_ratio": 0.0, "completions/max_length": 2428.0, "completions/max_terminated_length": 2428.0, "completions/mean_length": 1826.09375, "completions/mean_terminated_length": 1826.09375, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.37763065844774246, "epoch": 0.00048, "frac_reward_zero_std": 0.0, "grad_norm": 2.4737789630889893, "kl": 0.003610707528423518, "learning_rate": 8.571428571428571e-07, "loss": 0.0212, "num_tokens": 469858.0, "reward": 0.4584375023841858, "reward_std": 0.2892817258834839, "rewards/rollout_reward_func/mean": 0.4584375023841858, "rewards/rollout_reward_func/std": 0.4035496413707733, "sampling/importance_sampling_ratio/max": 1.8672934770584106, "sampling/importance_sampling_ratio/mean": 0.9250987768173218, "sampling/importance_sampling_ratio/min": 0.2111542820930481, "sampling/sampling_logp_difference/max": 1.105020523071289, "sampling/sampling_logp_difference/mean": 0.04392547905445099, "step": 6, "step_time": 11.808095918000163 }, { "clip_ratio/high_max": 0.02163859363645315, "clip_ratio/high_mean": 0.007195362821221352, "clip_ratio/low_mean": 0.009288194705732167, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01648355764336884, "completions/clipped_ratio": 0.0, "completions/max_length": 2799.0, "completions/max_terminated_length": 2799.0, "completions/mean_length": 2101.9375, "completions/mean_terminated_length": 2101.9375, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.4031049609184265, "epoch": 0.00056, "frac_reward_zero_std": 0.375, "grad_norm": 2.241011142730713, "kl": 0.004900285159237683, "learning_rate": 1.0285714285714286e-06, "loss": 0.0307, "num_tokens": 549695.0, "reward": 0.32218751311302185, "reward_std": 0.10592572391033173, "rewards/rollout_reward_func/mean": 0.32218751311302185, "rewards/rollout_reward_func/std": 0.22224271297454834, "sampling/importance_sampling_ratio/max": 2.7520875930786133, "sampling/importance_sampling_ratio/mean": 0.9687752723693848, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8389774560928345, "sampling/sampling_logp_difference/mean": 0.043909620493650436, "step": 7, "step_time": 13.085814365000147 }, { "clip_ratio/high_max": 0.029183519072830677, "clip_ratio/high_mean": 0.008625667076557875, "clip_ratio/low_mean": 0.016130636679008603, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02475630398839712, "completions/clipped_ratio": 0.0, "completions/max_length": 2794.0, "completions/max_terminated_length": 2794.0, "completions/mean_length": 2250.71875, "completions/mean_terminated_length": 2250.71875, "completions/min_length": 1570.0, "completions/min_terminated_length": 1570.0, "entropy": 0.43513813614845276, "epoch": 0.00064, "frac_reward_zero_std": 0.25, "grad_norm": 2.6571757793426514, "kl": 0.0038885354879312217, "learning_rate": 1.2000000000000002e-06, "loss": -0.0896, "num_tokens": 634822.0, "reward": 0.30375000834465027, "reward_std": 0.11063194274902344, "rewards/rollout_reward_func/mean": 0.30375000834465027, "rewards/rollout_reward_func/std": 0.22577106952667236, "sampling/importance_sampling_ratio/max": 2.24173641204834, "sampling/importance_sampling_ratio/mean": 0.9777867794036865, "sampling/importance_sampling_ratio/min": 0.4010058343410492, "sampling/sampling_logp_difference/max": 0.9179394245147705, "sampling/sampling_logp_difference/mean": 0.0499531514942646, "step": 8, "step_time": 13.055169309000007 }, { "clip_ratio/high_max": 0.014742525294423103, "clip_ratio/high_mean": 0.003685631323605776, "clip_ratio/low_mean": 0.008176195668056607, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011861827224493027, "completions/clipped_ratio": 0.0, "completions/max_length": 2810.0, "completions/max_terminated_length": 2810.0, "completions/mean_length": 1657.03125, "completions/mean_terminated_length": 1657.03125, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.39970337599515915, "epoch": 0.00072, "frac_reward_zero_std": 0.5, "grad_norm": 1.901354432106018, "kl": 0.005712996702641249, "learning_rate": 1.3714285714285715e-06, "loss": 0.0004, "num_tokens": 699616.0, "reward": 0.3528124690055847, "reward_std": 0.20240315794944763, "rewards/rollout_reward_func/mean": 0.3528124690055847, "rewards/rollout_reward_func/std": 0.3597510755062103, "sampling/importance_sampling_ratio/max": 2.387613296508789, "sampling/importance_sampling_ratio/mean": 1.0771517753601074, "sampling/importance_sampling_ratio/min": 0.5435174703598022, "sampling/sampling_logp_difference/max": 0.6833771467208862, "sampling/sampling_logp_difference/mean": 0.04181923717260361, "step": 9, "step_time": 13.294292020000057 }, { "clip_ratio/high_max": 0.03906210558488965, "clip_ratio/high_mean": 0.015391179244033992, "clip_ratio/low_mean": 0.0073633925057947636, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022754571866244078, "completions/clipped_ratio": 0.0, "completions/max_length": 2793.0, "completions/max_terminated_length": 2793.0, "completions/mean_length": 2214.53125, "completions/mean_terminated_length": 2214.53125, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.41637370735406876, "epoch": 0.0008, "frac_reward_zero_std": 0.25, "grad_norm": 2.541189670562744, "kl": 0.004748326842673123, "learning_rate": 1.5428571428571428e-06, "loss": -0.0168, "num_tokens": 783707.0, "reward": 0.4284375309944153, "reward_std": 0.1260128915309906, "rewards/rollout_reward_func/mean": 0.4284375309944153, "rewards/rollout_reward_func/std": 0.3622608780860901, "sampling/importance_sampling_ratio/max": 2.2261769771575928, "sampling/importance_sampling_ratio/mean": 1.042180061340332, "sampling/importance_sampling_ratio/min": 0.2320551723241806, "sampling/sampling_logp_difference/max": 1.021528959274292, "sampling/sampling_logp_difference/mean": 0.04730905592441559, "step": 10, "step_time": 13.637110585000073 }, { "clip_ratio/high_max": 0.010990338400006294, "clip_ratio/high_mean": 0.0027475846000015736, "clip_ratio/low_mean": 0.0016025641234591603, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004350148723460734, "completions/clipped_ratio": 0.0, "completions/max_length": 2793.0, "completions/max_terminated_length": 2793.0, "completions/mean_length": 1966.4375, "completions/mean_terminated_length": 1966.4375, "completions/min_length": 1055.0, "completions/min_terminated_length": 1055.0, "entropy": 0.4171219617128372, "epoch": 0.00088, "frac_reward_zero_std": 0.625, "grad_norm": 1.4207873344421387, "kl": 0.0035471616429276764, "learning_rate": 1.7142857142857143e-06, "loss": 0.0422, "num_tokens": 858994.0, "reward": 0.4596875309944153, "reward_std": 0.14279377460479736, "rewards/rollout_reward_func/mean": 0.4596875309944153, "rewards/rollout_reward_func/std": 0.3725802004337311, "sampling/importance_sampling_ratio/max": 1.874053716659546, "sampling/importance_sampling_ratio/mean": 0.9027889966964722, "sampling/importance_sampling_ratio/min": 0.45684853196144104, "sampling/sampling_logp_difference/max": 0.5253086090087891, "sampling/sampling_logp_difference/mean": 0.0447448305785656, "step": 11, "step_time": 12.90678849699998 }, { "clip_ratio/high_max": 0.039288708940148354, "clip_ratio/high_mean": 0.017087680520489812, "clip_ratio/low_mean": 0.008439590455964208, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02552727097645402, "completions/clipped_ratio": 0.0, "completions/max_length": 2782.0, "completions/max_terminated_length": 2782.0, "completions/mean_length": 1859.5625, "completions/mean_terminated_length": 1859.5625, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.42384539544582367, "epoch": 0.00096, "frac_reward_zero_std": 0.375, "grad_norm": 2.1793887615203857, "kl": 0.004719441349152476, "learning_rate": 1.8857142857142858e-06, "loss": -0.0501, "num_tokens": 930647.0, "reward": 0.5653125047683716, "reward_std": 0.09132834523916245, "rewards/rollout_reward_func/mean": 0.5653125047683716, "rewards/rollout_reward_func/std": 0.4122869372367859, "sampling/importance_sampling_ratio/max": 1.968488097190857, "sampling/importance_sampling_ratio/mean": 1.1238960027694702, "sampling/importance_sampling_ratio/min": 0.5891481637954712, "sampling/sampling_logp_difference/max": 0.9189000129699707, "sampling/sampling_logp_difference/mean": 0.045362215489149094, "step": 12, "step_time": 12.105040577999944 }, { "clip_ratio/high_max": 0.03163956617936492, "clip_ratio/high_mean": 0.009185401839204133, "clip_ratio/low_mean": 0.01267810445278883, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021863506408408284, "completions/clipped_ratio": 0.0, "completions/max_length": 2794.0, "completions/max_terminated_length": 2794.0, "completions/mean_length": 1744.0625, "completions/mean_terminated_length": 1744.0625, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.38077671080827713, "epoch": 0.00104, "frac_reward_zero_std": 0.0, "grad_norm": 2.601933240890503, "kl": 0.0043588640401139855, "learning_rate": 2.0571428571428573e-06, "loss": -0.0271, "num_tokens": 998268.0, "reward": 0.5040624737739563, "reward_std": 0.3648141622543335, "rewards/rollout_reward_func/mean": 0.5040624737739563, "rewards/rollout_reward_func/std": 0.4420104920864105, "sampling/importance_sampling_ratio/max": 2.2825241088867188, "sampling/importance_sampling_ratio/mean": 1.0028969049453735, "sampling/importance_sampling_ratio/min": 0.37051475048065186, "sampling/sampling_logp_difference/max": 0.6929263472557068, "sampling/sampling_logp_difference/mean": 0.043037254363298416, "step": 13, "step_time": 12.434848872999964 }, { "clip_ratio/high_max": 0.0682385629042983, "clip_ratio/high_mean": 0.022985405288636684, "clip_ratio/low_mean": 0.0055555556900799274, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028540961910039186, "completions/clipped_ratio": 0.0, "completions/max_length": 2797.0, "completions/max_terminated_length": 2797.0, "completions/mean_length": 2003.40625, "completions/mean_terminated_length": 2003.40625, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.40549617260694504, "epoch": 0.00112, "frac_reward_zero_std": 0.125, "grad_norm": 2.6730706691741943, "kl": 0.004465080099180341, "learning_rate": 2.2285714285714286e-06, "loss": 0.0367, "num_tokens": 1075200.0, "reward": 0.3971875011920929, "reward_std": 0.24656714498996735, "rewards/rollout_reward_func/mean": 0.3971875011920929, "rewards/rollout_reward_func/std": 0.3921506702899933, "sampling/importance_sampling_ratio/max": 2.08994197845459, "sampling/importance_sampling_ratio/mean": 0.9472236037254333, "sampling/importance_sampling_ratio/min": 0.2920815646648407, "sampling/sampling_logp_difference/max": 0.5747750997543335, "sampling/sampling_logp_difference/mean": 0.0427585169672966, "step": 14, "step_time": 13.12732943400033 }, { "clip_ratio/high_max": 0.04484127042815089, "clip_ratio/high_mean": 0.01285505446139723, "clip_ratio/low_mean": 0.008134920848533511, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020989975426346064, "completions/clipped_ratio": 0.0, "completions/max_length": 2801.0, "completions/max_terminated_length": 2801.0, "completions/mean_length": 1879.96875, "completions/mean_terminated_length": 1879.96875, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.3704817444086075, "epoch": 0.0012, "frac_reward_zero_std": 0.375, "grad_norm": 1.3741340637207031, "kl": 0.004930144699756056, "learning_rate": 2.4000000000000003e-06, "loss": -0.036, "num_tokens": 1147654.0, "reward": 0.5634374618530273, "reward_std": 0.2032102644443512, "rewards/rollout_reward_func/mean": 0.5634374618530273, "rewards/rollout_reward_func/std": 0.4479026794433594, "sampling/importance_sampling_ratio/max": 1.419919490814209, "sampling/importance_sampling_ratio/mean": 0.8213506937026978, "sampling/importance_sampling_ratio/min": 0.2297196239233017, "sampling/sampling_logp_difference/max": 0.9635820388793945, "sampling/sampling_logp_difference/mean": 0.04450097680091858, "step": 15, "step_time": 12.787183091000088 }, { "clip_ratio/high_max": 0.040403091348707676, "clip_ratio/high_mean": 0.019732415094040334, "clip_ratio/low_mean": 0.011093285749666393, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.030825700610876083, "completions/clipped_ratio": 0.0, "completions/max_length": 2812.0, "completions/max_terminated_length": 2812.0, "completions/mean_length": 2226.75, "completions/mean_terminated_length": 2226.75, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.4390428438782692, "epoch": 0.00128, "frac_reward_zero_std": 0.125, "grad_norm": 3.3430113792419434, "kl": 0.005434123100712895, "learning_rate": 2.571428571428571e-06, "loss": 0.0608, "num_tokens": 1232479.0, "reward": 0.35874998569488525, "reward_std": 0.16885429620742798, "rewards/rollout_reward_func/mean": 0.35874998569488525, "rewards/rollout_reward_func/std": 0.31368517875671387, "sampling/importance_sampling_ratio/max": 2.1880178451538086, "sampling/importance_sampling_ratio/mean": 0.9618589878082275, "sampling/importance_sampling_ratio/min": 0.12961336970329285, "sampling/sampling_logp_difference/max": 0.941362738609314, "sampling/sampling_logp_difference/mean": 0.05188923329114914, "step": 16, "step_time": 13.173405885999728 }, { "clip_ratio/high_max": 0.04390919208526611, "clip_ratio/high_mean": 0.017439239425584674, "clip_ratio/low_mean": 0.007801226573064923, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.025240465998649597, "completions/clipped_ratio": 0.0, "completions/max_length": 2434.0, "completions/max_terminated_length": 2434.0, "completions/mean_length": 1750.0625, "completions/mean_terminated_length": 1750.0625, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.3980557546019554, "epoch": 0.00136, "frac_reward_zero_std": 0.0, "grad_norm": 2.602077007293701, "kl": 0.004334585275501013, "learning_rate": 2.742857142857143e-06, "loss": -0.0264, "num_tokens": 1300635.0, "reward": 0.38499999046325684, "reward_std": 0.221183180809021, "rewards/rollout_reward_func/mean": 0.38499999046325684, "rewards/rollout_reward_func/std": 0.34839722514152527, "sampling/importance_sampling_ratio/max": 1.7235372066497803, "sampling/importance_sampling_ratio/mean": 0.9467421770095825, "sampling/importance_sampling_ratio/min": 0.2654297649860382, "sampling/sampling_logp_difference/max": 0.7773740887641907, "sampling/sampling_logp_difference/mean": 0.04712219163775444, "step": 17, "step_time": 11.411276259999795 }, { "clip_ratio/high_max": 0.04594441968947649, "clip_ratio/high_mean": 0.013718247646465898, "clip_ratio/low_mean": 0.004949534311890602, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018667781492695212, "completions/clipped_ratio": 0.0, "completions/max_length": 2798.0, "completions/max_terminated_length": 2798.0, "completions/mean_length": 2309.09375, "completions/mean_terminated_length": 2309.09375, "completions/min_length": 1055.0, "completions/min_terminated_length": 1055.0, "entropy": 0.4307108670473099, "epoch": 0.00144, "frac_reward_zero_std": 0.375, "grad_norm": 1.9505234956741333, "kl": 0.004669323505368084, "learning_rate": 2.9142857142857142e-06, "loss": 0.0981, "num_tokens": 1388529.0, "reward": 0.3696874976158142, "reward_std": 0.155008003115654, "rewards/rollout_reward_func/mean": 0.3696874976158142, "rewards/rollout_reward_func/std": 0.28414538502693176, "sampling/importance_sampling_ratio/max": 1.8336728811264038, "sampling/importance_sampling_ratio/mean": 0.9352109432220459, "sampling/importance_sampling_ratio/min": 0.28059616684913635, "sampling/sampling_logp_difference/max": 1.0694303512573242, "sampling/sampling_logp_difference/mean": 0.05270082503557205, "step": 18, "step_time": 13.463537552000162 }, { "clip_ratio/high_max": 0.03351574344560504, "clip_ratio/high_mean": 0.017963151913136244, "clip_ratio/low_mean": 0.005672972998581827, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023636124562472105, "completions/clipped_ratio": 0.0, "completions/max_length": 2782.0, "completions/max_terminated_length": 2782.0, "completions/mean_length": 2203.90625, "completions/mean_terminated_length": 2203.90625, "completions/min_length": 1564.0, "completions/min_terminated_length": 1564.0, "entropy": 0.4163732975721359, "epoch": 0.00152, "frac_reward_zero_std": 0.25, "grad_norm": 2.54580020904541, "kl": 0.0039027896127663553, "learning_rate": 3.0857142857142855e-06, "loss": -0.0385, "num_tokens": 1472480.0, "reward": 0.2887499928474426, "reward_std": 0.1067335307598114, "rewards/rollout_reward_func/mean": 0.2887499928474426, "rewards/rollout_reward_func/std": 0.17496080696582794, "sampling/importance_sampling_ratio/max": 2.46917724609375, "sampling/importance_sampling_ratio/mean": 1.0520013570785522, "sampling/importance_sampling_ratio/min": 0.31319668889045715, "sampling/sampling_logp_difference/max": 0.6751515865325928, "sampling/sampling_logp_difference/mean": 0.04795370250940323, "step": 19, "step_time": 12.881922476999762 }, { "clip_ratio/high_max": 0.03289473615586758, "clip_ratio/high_mean": 0.016854635905474424, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016854635905474424, "completions/clipped_ratio": 0.0, "completions/max_length": 2803.0, "completions/max_terminated_length": 2803.0, "completions/mean_length": 1839.5625, "completions/mean_terminated_length": 1839.5625, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.3896697387099266, "epoch": 0.0016, "frac_reward_zero_std": 0.5, "grad_norm": 3.0274949073791504, "kl": 0.004332752665504813, "learning_rate": 3.257142857142857e-06, "loss": 0.1087, "num_tokens": 1543546.0, "reward": 0.4571874737739563, "reward_std": 0.20620864629745483, "rewards/rollout_reward_func/mean": 0.4571874737739563, "rewards/rollout_reward_func/std": 0.38446637988090515, "sampling/importance_sampling_ratio/max": 2.2497854232788086, "sampling/importance_sampling_ratio/mean": 0.9864073395729065, "sampling/importance_sampling_ratio/min": 0.3370327055454254, "sampling/sampling_logp_difference/max": 0.9195313453674316, "sampling/sampling_logp_difference/mean": 0.04598519578576088, "step": 20, "step_time": 13.28698261799991 }, { "clip_ratio/high_max": 0.047807968221604824, "clip_ratio/high_mean": 0.017668311716988683, "clip_ratio/low_mean": 0.0087070451118052, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026375357527285814, "completions/clipped_ratio": 0.0, "completions/max_length": 2784.0, "completions/max_terminated_length": 2784.0, "completions/mean_length": 1916.84375, "completions/mean_terminated_length": 1916.84375, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.3857065215706825, "epoch": 0.00168, "frac_reward_zero_std": 0.125, "grad_norm": 2.2928388118743896, "kl": 0.0030007859459146857, "learning_rate": 3.4285714285714285e-06, "loss": 0.0691, "num_tokens": 1617299.0, "reward": 0.5199999809265137, "reward_std": 0.24810142815113068, "rewards/rollout_reward_func/mean": 0.5199999809265137, "rewards/rollout_reward_func/std": 0.44394853711128235, "sampling/importance_sampling_ratio/max": 1.9692103862762451, "sampling/importance_sampling_ratio/mean": 1.0206944942474365, "sampling/importance_sampling_ratio/min": 0.37676262855529785, "sampling/sampling_logp_difference/max": 0.5263509750366211, "sampling/sampling_logp_difference/mean": 0.04122690111398697, "step": 21, "step_time": 12.188486421000334 }, { "clip_ratio/high_max": 0.040458154398947954, "clip_ratio/high_mean": 0.011364538804627955, "clip_ratio/low_mean": 0.006526540499180555, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017891079653054476, "completions/clipped_ratio": 0.0, "completions/max_length": 2801.0, "completions/max_terminated_length": 2801.0, "completions/mean_length": 1878.4375, "completions/mean_terminated_length": 1878.4375, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.3931129276752472, "epoch": 0.00176, "frac_reward_zero_std": 0.125, "grad_norm": 1.7913806438446045, "kl": 0.004387594643048942, "learning_rate": 3.6e-06, "loss": -0.0657, "num_tokens": 1690165.0, "reward": 0.48281246423721313, "reward_std": 0.24489575624465942, "rewards/rollout_reward_func/mean": 0.48281246423721313, "rewards/rollout_reward_func/std": 0.42833685874938965, "sampling/importance_sampling_ratio/max": 2.001044750213623, "sampling/importance_sampling_ratio/mean": 0.8716533780097961, "sampling/importance_sampling_ratio/min": 0.21946659684181213, "sampling/sampling_logp_difference/max": 0.6549723148345947, "sampling/sampling_logp_difference/mean": 0.04290828853845596, "step": 22, "step_time": 12.900562208999872 }, { "clip_ratio/high_max": 0.012908496893942356, "clip_ratio/high_mean": 0.003227124223485589, "clip_ratio/low_mean": 0.0013888889225199819, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004616013146005571, "completions/clipped_ratio": 0.0, "completions/max_length": 2795.0, "completions/max_terminated_length": 2795.0, "completions/mean_length": 1921.96875, "completions/mean_terminated_length": 1921.96875, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.3991682603955269, "epoch": 0.00184, "frac_reward_zero_std": 0.375, "grad_norm": 2.757072687149048, "kl": 0.002871026110369712, "learning_rate": 3.7714285714285716e-06, "loss": -0.0322, "num_tokens": 1764351.0, "reward": 0.4725000262260437, "reward_std": 0.15098075568675995, "rewards/rollout_reward_func/mean": 0.4725000262260437, "rewards/rollout_reward_func/std": 0.3937331438064575, "sampling/importance_sampling_ratio/max": 2.473691463470459, "sampling/importance_sampling_ratio/mean": 1.0277502536773682, "sampling/importance_sampling_ratio/min": 0.3683130145072937, "sampling/sampling_logp_difference/max": 0.8325839042663574, "sampling/sampling_logp_difference/mean": 0.041013769805431366, "step": 23, "step_time": 12.923486550999996 }, { "clip_ratio/high_max": 0.022044573910534382, "clip_ratio/high_mean": 0.0072472544852644205, "clip_ratio/low_mean": 0.007787698996253312, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015034952783025801, "completions/clipped_ratio": 0.0, "completions/max_length": 2791.0, "completions/max_terminated_length": 2791.0, "completions/mean_length": 2014.1875, "completions/mean_terminated_length": 2014.1875, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.39251676946878433, "epoch": 0.00192, "frac_reward_zero_std": 0.375, "grad_norm": 1.9654884338378906, "kl": 0.0081728242803365, "learning_rate": 3.942857142857143e-06, "loss": -0.0383, "num_tokens": 1841628.0, "reward": 0.35874998569488525, "reward_std": 0.21719886362552643, "rewards/rollout_reward_func/mean": 0.35874998569488525, "rewards/rollout_reward_func/std": 0.31252095103263855, "sampling/importance_sampling_ratio/max": 2.0834484100341797, "sampling/importance_sampling_ratio/mean": 0.9893499612808228, "sampling/importance_sampling_ratio/min": 0.06596492230892181, "sampling/sampling_logp_difference/max": 1.764291524887085, "sampling/sampling_logp_difference/mean": 0.05037356913089752, "step": 24, "step_time": 12.55188547500029 }, { "clip_ratio/high_max": 0.03194444486871362, "clip_ratio/high_mean": 0.009474206599406898, "clip_ratio/low_mean": 0.004620927385985851, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014095134101808071, "completions/clipped_ratio": 0.0, "completions/max_length": 2432.0, "completions/max_terminated_length": 2432.0, "completions/mean_length": 1997.46875, "completions/mean_terminated_length": 1997.46875, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.40253835916519165, "epoch": 0.002, "frac_reward_zero_std": 0.5, "grad_norm": 2.1649582386016846, "kl": 0.007934511464554816, "learning_rate": 4.114285714285715e-06, "loss": -0.084, "num_tokens": 1918276.0, "reward": 0.3425000011920929, "reward_std": 0.16030071675777435, "rewards/rollout_reward_func/mean": 0.3425000011920929, "rewards/rollout_reward_func/std": 0.27845191955566406, "sampling/importance_sampling_ratio/max": 1.7379083633422852, "sampling/importance_sampling_ratio/mean": 1.0123233795166016, "sampling/importance_sampling_ratio/min": 0.21978217363357544, "sampling/sampling_logp_difference/max": 0.9820888042449951, "sampling/sampling_logp_difference/mean": 0.043975915759801865, "step": 25, "step_time": 11.6416912709999 }, { "clip_ratio/high_max": 0.057189542800188065, "clip_ratio/high_mean": 0.02329625654965639, "clip_ratio/low_mean": 0.008795286994427443, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03209154261276126, "completions/clipped_ratio": 0.0, "completions/max_length": 2812.0, "completions/max_terminated_length": 2812.0, "completions/mean_length": 2010.0625, "completions/mean_terminated_length": 2010.0625, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.3653796315193176, "epoch": 0.00208, "frac_reward_zero_std": 0.125, "grad_norm": 2.1230344772338867, "kl": 0.006589570315554738, "learning_rate": 4.285714285714286e-06, "loss": -0.0197, "num_tokens": 1995372.0, "reward": 0.4256249964237213, "reward_std": 0.23703671991825104, "rewards/rollout_reward_func/mean": 0.4256249964237213, "rewards/rollout_reward_func/std": 0.3602412939071655, "sampling/importance_sampling_ratio/max": 1.7632914781570435, "sampling/importance_sampling_ratio/mean": 0.9213794469833374, "sampling/importance_sampling_ratio/min": 0.4378761649131775, "sampling/sampling_logp_difference/max": 0.56688392162323, "sampling/sampling_logp_difference/mean": 0.03944293037056923, "step": 26, "step_time": 13.15109654299954 }, { "clip_ratio/high_max": 0.04949874710291624, "clip_ratio/high_mean": 0.02149919094517827, "clip_ratio/low_mean": 0.00766741088591516, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02916660183109343, "completions/clipped_ratio": 0.0, "completions/max_length": 2785.0, "completions/max_terminated_length": 2785.0, "completions/mean_length": 1842.0625, "completions/mean_terminated_length": 1842.0625, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.4269302785396576, "epoch": 0.00216, "frac_reward_zero_std": 0.25, "grad_norm": 1.6013935804367065, "kl": 0.00617267657071352, "learning_rate": 4.457142857142857e-06, "loss": -0.0345, "num_tokens": 2066465.0, "reward": 0.5221875309944153, "reward_std": 0.22779378294944763, "rewards/rollout_reward_func/mean": 0.5221875309944153, "rewards/rollout_reward_func/std": 0.4334239661693573, "sampling/importance_sampling_ratio/max": 2.312187433242798, "sampling/importance_sampling_ratio/mean": 0.8621585369110107, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9948511123657227, "sampling/sampling_logp_difference/mean": 0.051924653351306915, "step": 27, "step_time": 12.681567872999949 }, { "clip_ratio/high_max": 0.04371212236583233, "clip_ratio/high_mean": 0.0183574166148901, "clip_ratio/low_mean": 0.005908275721594691, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.024265691870823503, "completions/clipped_ratio": 0.0, "completions/max_length": 2796.0, "completions/max_terminated_length": 2796.0, "completions/mean_length": 2155.5, "completions/mean_terminated_length": 2155.5, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.41429970413446426, "epoch": 0.00224, "frac_reward_zero_std": 0.25, "grad_norm": 2.647275447845459, "kl": 0.010079714236781001, "learning_rate": 4.628571428571429e-06, "loss": -0.0864, "num_tokens": 2148817.0, "reward": 0.3021875023841858, "reward_std": 0.11279378086328506, "rewards/rollout_reward_func/mean": 0.3021875023841858, "rewards/rollout_reward_func/std": 0.23064753413200378, "sampling/importance_sampling_ratio/max": 2.1843345165252686, "sampling/importance_sampling_ratio/mean": 0.9328470230102539, "sampling/importance_sampling_ratio/min": 0.11585874110460281, "sampling/sampling_logp_difference/max": 1.9821176528930664, "sampling/sampling_logp_difference/mean": 0.05276907980442047, "step": 28, "step_time": 12.799536062000016 }, { "clip_ratio/high_max": 0.039141415152698755, "clip_ratio/high_mean": 0.019034530967473984, "clip_ratio/low_mean": 0.005208333372138441, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02424286410678178, "completions/clipped_ratio": 0.0, "completions/max_length": 2411.0, "completions/max_terminated_length": 2411.0, "completions/mean_length": 1544.21875, "completions/mean_terminated_length": 1544.21875, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.38873114436864853, "epoch": 0.00232, "frac_reward_zero_std": 0.0, "grad_norm": 2.288419485092163, "kl": 0.008441059850156307, "learning_rate": 4.800000000000001e-06, "loss": -0.0294, "num_tokens": 2209518.0, "reward": 0.5049999952316284, "reward_std": 0.367961049079895, "rewards/rollout_reward_func/mean": 0.5049999952316284, "rewards/rollout_reward_func/std": 0.4586867392063141, "sampling/importance_sampling_ratio/max": 1.7176055908203125, "sampling/importance_sampling_ratio/mean": 0.8919655084609985, "sampling/importance_sampling_ratio/min": 0.3174732029438019, "sampling/sampling_logp_difference/max": 1.007685899734497, "sampling/sampling_logp_difference/mean": 0.043198756873607635, "step": 29, "step_time": 11.569315259000177 }, { "clip_ratio/high_max": 0.03119284799322486, "clip_ratio/high_mean": 0.009251700364984572, "clip_ratio/low_mean": 0.0032051282469183207, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012456828728318214, "completions/clipped_ratio": 0.0, "completions/max_length": 2776.0, "completions/max_terminated_length": 2776.0, "completions/mean_length": 1695.40625, "completions/mean_terminated_length": 1695.40625, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.38929111510515213, "epoch": 0.0024, "frac_reward_zero_std": 0.25, "grad_norm": 1.749756932258606, "kl": 0.01017191493883729, "learning_rate": 4.9714285714285715e-06, "loss": 0.0146, "num_tokens": 2275561.0, "reward": 0.5309374928474426, "reward_std": 0.32216140627861023, "rewards/rollout_reward_func/mean": 0.5309374928474426, "rewards/rollout_reward_func/std": 0.4390852451324463, "sampling/importance_sampling_ratio/max": 2.9540531635284424, "sampling/importance_sampling_ratio/mean": 1.0208276510238647, "sampling/importance_sampling_ratio/min": 0.37041175365448, "sampling/sampling_logp_difference/max": 0.5885751247406006, "sampling/sampling_logp_difference/mean": 0.04683335870504379, "step": 30, "step_time": 12.191692169999897 }, { "clip_ratio/high_max": 0.05563905602321029, "clip_ratio/high_mean": 0.01747169380541891, "clip_ratio/low_mean": 0.008184524020180106, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02565621805842966, "completions/clipped_ratio": 0.0, "completions/max_length": 2783.0, "completions/max_terminated_length": 2783.0, "completions/mean_length": 1801.09375, "completions/mean_terminated_length": 1801.09375, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.3914438411593437, "epoch": 0.00248, "frac_reward_zero_std": 0.125, "grad_norm": 2.8585875034332275, "kl": 0.015274998731911182, "learning_rate": 5.142857142857142e-06, "loss": 0.0419, "num_tokens": 2345322.0, "reward": 0.36281251907348633, "reward_std": 0.2801453769207001, "rewards/rollout_reward_func/mean": 0.36281251907348633, "rewards/rollout_reward_func/std": 0.342911958694458, "sampling/importance_sampling_ratio/max": 2.163181781768799, "sampling/importance_sampling_ratio/mean": 0.9487945437431335, "sampling/importance_sampling_ratio/min": 0.29707521200180054, "sampling/sampling_logp_difference/max": 0.7824678421020508, "sampling/sampling_logp_difference/mean": 0.0532098188996315, "step": 31, "step_time": 13.19187305000014 }, { "clip_ratio/high_max": 0.03187447274103761, "clip_ratio/high_mean": 0.018647319404408336, "clip_ratio/low_mean": 0.004727297928184271, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02337461756542325, "completions/clipped_ratio": 0.0, "completions/max_length": 2435.0, "completions/max_terminated_length": 2435.0, "completions/mean_length": 1984.90625, "completions/mean_terminated_length": 1984.90625, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.416415698826313, "epoch": 0.00256, "frac_reward_zero_std": 0.25, "grad_norm": 2.3030495643615723, "kl": 0.015865659108385444, "learning_rate": 5.314285714285714e-06, "loss": -0.0567, "num_tokens": 2421421.0, "reward": 0.3878124952316284, "reward_std": 0.23157384991645813, "rewards/rollout_reward_func/mean": 0.3878124952316284, "rewards/rollout_reward_func/std": 0.3412286341190338, "sampling/importance_sampling_ratio/max": 2.5926010608673096, "sampling/importance_sampling_ratio/mean": 0.9760158658027649, "sampling/importance_sampling_ratio/min": 0.2061164528131485, "sampling/sampling_logp_difference/max": 0.8063008785247803, "sampling/sampling_logp_difference/mean": 0.04909588024020195, "step": 32, "step_time": 11.466194520999807 }, { "clip_ratio/high_max": 0.019717262126505375, "clip_ratio/high_mean": 0.004929315531626344, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004929315531626344, "completions/clipped_ratio": 0.0, "completions/max_length": 2809.0, "completions/max_terminated_length": 2809.0, "completions/mean_length": 2102.71875, "completions/mean_terminated_length": 2102.71875, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.42558059841394424, "epoch": 0.00264, "frac_reward_zero_std": 0.625, "grad_norm": 1.5914040803909302, "kl": 0.010543531039729714, "learning_rate": 5.485714285714286e-06, "loss": 0.0448, "num_tokens": 2501867.0, "reward": 0.5221875309944153, "reward_std": 0.14279377460479736, "rewards/rollout_reward_func/mean": 0.5221875309944153, "rewards/rollout_reward_func/std": 0.4007873833179474, "sampling/importance_sampling_ratio/max": 1.5994207859039307, "sampling/importance_sampling_ratio/mean": 0.8397550582885742, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9267706871032715, "sampling/sampling_logp_difference/mean": 0.0471554696559906, "step": 33, "step_time": 12.975996798000097 }, { "clip_ratio/high_max": 0.040178571827709675, "clip_ratio/high_mean": 0.016144166933372617, "clip_ratio/low_mean": 0.005662594106979668, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021806761040352285, "completions/clipped_ratio": 0.0, "completions/max_length": 2441.0, "completions/max_terminated_length": 2441.0, "completions/mean_length": 1488.4375, "completions/mean_terminated_length": 1488.4375, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.35695891827344894, "epoch": 0.00272, "frac_reward_zero_std": 0.125, "grad_norm": 1.6733559370040894, "kl": 0.020034206565469503, "learning_rate": 5.6571428571428576e-06, "loss": -0.0588, "num_tokens": 2560884.0, "reward": 0.5859375, "reward_std": 0.38607701659202576, "rewards/rollout_reward_func/mean": 0.5859375, "rewards/rollout_reward_func/std": 0.45654281973838806, "sampling/importance_sampling_ratio/max": 1.8220971822738647, "sampling/importance_sampling_ratio/mean": 0.9860107898712158, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9601047039031982, "sampling/sampling_logp_difference/mean": 0.052328821271657944, "step": 34, "step_time": 10.76481853400037 }, { "clip_ratio/high_max": 0.00657894741743803, "clip_ratio/high_mean": 0.003289473708719015, "clip_ratio/low_mean": 0.008878070977516472, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012167544686235487, "completions/clipped_ratio": 0.0, "completions/max_length": 2801.0, "completions/max_terminated_length": 2801.0, "completions/mean_length": 1756.46875, "completions/mean_terminated_length": 1756.46875, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.38564804941415787, "epoch": 0.0028, "frac_reward_zero_std": 0.5, "grad_norm": 1.5950710773468018, "kl": 0.0196278584189713, "learning_rate": 5.8285714285714284e-06, "loss": 0.0794, "num_tokens": 2629098.0, "reward": 0.4750000238418579, "reward_std": 0.26933756470680237, "rewards/rollout_reward_func/mean": 0.4750000238418579, "rewards/rollout_reward_func/std": 0.40420371294021606, "sampling/importance_sampling_ratio/max": 2.8944315910339355, "sampling/importance_sampling_ratio/mean": 1.212613582611084, "sampling/importance_sampling_ratio/min": 0.3920697867870331, "sampling/sampling_logp_difference/max": 0.7614344358444214, "sampling/sampling_logp_difference/mean": 0.050811417400836945, "step": 35, "step_time": 12.117880444999855 }, { "clip_ratio/high_max": 0.032855731435120106, "clip_ratio/high_mean": 0.008213932858780026, "clip_ratio/low_mean": 0.008068988332524896, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016282920725643635, "completions/clipped_ratio": 0.0, "completions/max_length": 2819.0, "completions/max_terminated_length": 2819.0, "completions/mean_length": 2214.375, "completions/mean_terminated_length": 2214.375, "completions/min_length": 1579.0, "completions/min_terminated_length": 1579.0, "entropy": 0.4132639244198799, "epoch": 0.00288, "frac_reward_zero_std": 0.5, "grad_norm": 1.4248710870742798, "kl": 0.04949819762259722, "learning_rate": 6e-06, "loss": -0.1152, "num_tokens": 2713433.0, "reward": 0.3043749928474426, "reward_std": 0.08011817932128906, "rewards/rollout_reward_func/mean": 0.3043749928474426, "rewards/rollout_reward_func/std": 0.16871310770511627, "sampling/importance_sampling_ratio/max": 2.279515504837036, "sampling/importance_sampling_ratio/mean": 1.0208816528320312, "sampling/importance_sampling_ratio/min": 0.2197788804769516, "sampling/sampling_logp_difference/max": 1.5309280157089233, "sampling/sampling_logp_difference/mean": 0.05491582304239273, "step": 36, "step_time": 13.165009270000155 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0022321429569274187, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006696428870782256, "completions/clipped_ratio": 0.0, "completions/max_length": 2803.0, "completions/max_terminated_length": 2803.0, "completions/mean_length": 1736.1875, "completions/mean_terminated_length": 1736.1875, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.3515569269657135, "epoch": 0.00296, "frac_reward_zero_std": 0.5, "grad_norm": 1.0670298337936401, "kl": 0.025617226026952267, "learning_rate": 5.999999982184864e-06, "loss": 0.0221, "num_tokens": 2780777.0, "reward": 0.4387500286102295, "reward_std": 0.25966876745224, "rewards/rollout_reward_func/mean": 0.4387500286102295, "rewards/rollout_reward_func/std": 0.3832606077194214, "sampling/importance_sampling_ratio/max": 2.3271644115448, "sampling/importance_sampling_ratio/mean": 1.0649113655090332, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0678925514221191, "sampling/sampling_logp_difference/mean": 0.05666026473045349, "step": 37, "step_time": 12.593250806000015 }, { "clip_ratio/high_max": 0.028383397962898016, "clip_ratio/high_mean": 0.010161041049286723, "clip_ratio/low_mean": 0.006483843666501343, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016644884599372745, "completions/clipped_ratio": 0.0, "completions/max_length": 2777.0, "completions/max_terminated_length": 2777.0, "completions/mean_length": 1819.5625, "completions/mean_terminated_length": 1819.5625, "completions/min_length": 1056.0, "completions/min_terminated_length": 1056.0, "entropy": 0.38034912198781967, "epoch": 0.00304, "frac_reward_zero_std": 0.25, "grad_norm": 2.0448880195617676, "kl": 0.04296189732849598, "learning_rate": 5.999999928739459e-06, "loss": -0.0115, "num_tokens": 2851032.0, "reward": 0.6024999618530273, "reward_std": 0.2617889940738678, "rewards/rollout_reward_func/mean": 0.6024999618530273, "rewards/rollout_reward_func/std": 0.44098126888275146, "sampling/importance_sampling_ratio/max": 2.681164503097534, "sampling/importance_sampling_ratio/mean": 1.0418896675109863, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.4294462203979492, "sampling/sampling_logp_difference/mean": 0.0609976202249527, "step": 38, "step_time": 12.55964067500031 }, { "clip_ratio/high_max": 0.047167123295366764, "clip_ratio/high_mean": 0.014736625598743558, "clip_ratio/low_mean": 0.004429678898304701, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01916630449704826, "completions/clipped_ratio": 0.0, "completions/max_length": 2820.0, "completions/max_terminated_length": 2820.0, "completions/mean_length": 2000.0, "completions/mean_terminated_length": 2000.0, "completions/min_length": 1055.0, "completions/min_terminated_length": 1055.0, "entropy": 0.4035666435956955, "epoch": 0.00312, "frac_reward_zero_std": 0.25, "grad_norm": 1.904247760772705, "kl": 0.03608058113604784, "learning_rate": 5.999999839663784e-06, "loss": -0.1975, "num_tokens": 2927712.0, "reward": 0.3853124976158142, "reward_std": 0.1657649129629135, "rewards/rollout_reward_func/mean": 0.3853124976158142, "rewards/rollout_reward_func/std": 0.31012988090515137, "sampling/importance_sampling_ratio/max": 2.3516104221343994, "sampling/importance_sampling_ratio/mean": 0.8599222898483276, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.4187037944793701, "sampling/sampling_logp_difference/mean": 0.05978023633360863, "step": 39, "step_time": 12.440508590000036 }, { "clip_ratio/high_max": 0.04069459065794945, "clip_ratio/high_mean": 0.017941734986379743, "clip_ratio/low_mean": 0.0016447368543595076, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01958647184073925, "completions/clipped_ratio": 0.0, "completions/max_length": 2423.0, "completions/max_terminated_length": 2423.0, "completions/mean_length": 1889.0625, "completions/mean_terminated_length": 1889.0625, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.42887038737535477, "epoch": 0.0032, "frac_reward_zero_std": 0.25, "grad_norm": 2.507852077484131, "kl": 0.031137569807469845, "learning_rate": 5.99999971495784e-06, "loss": -0.0375, "num_tokens": 3000212.0, "reward": 0.38593751192092896, "reward_std": 0.16842570900917053, "rewards/rollout_reward_func/mean": 0.38593751192092896, "rewards/rollout_reward_func/std": 0.35313212871551514, "sampling/importance_sampling_ratio/max": 1.8619109392166138, "sampling/importance_sampling_ratio/mean": 0.8876512050628662, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8854889869689941, "sampling/sampling_logp_difference/mean": 0.0671561062335968, "step": 40, "step_time": 11.693177195999851 }, { "clip_ratio/high_max": 0.02651259582489729, "clip_ratio/high_mean": 0.006628148956224322, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00836426008027047, "completions/clipped_ratio": 0.0, "completions/max_length": 2789.0, "completions/max_terminated_length": 2789.0, "completions/mean_length": 2136.03125, "completions/mean_terminated_length": 2136.03125, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.42095063626766205, "epoch": 0.00328, "frac_reward_zero_std": 0.625, "grad_norm": 1.2850134372711182, "kl": 0.039208856876939535, "learning_rate": 5.99999955462163e-06, "loss": -0.0237, "num_tokens": 3081651.0, "reward": 0.3506249785423279, "reward_std": 0.1440507173538208, "rewards/rollout_reward_func/mean": 0.3506249785423279, "rewards/rollout_reward_func/std": 0.2683153748512268, "sampling/importance_sampling_ratio/max": 2.8166987895965576, "sampling/importance_sampling_ratio/mean": 1.0108704566955566, "sampling/importance_sampling_ratio/min": 0.14420194923877716, "sampling/sampling_logp_difference/max": 1.127936840057373, "sampling/sampling_logp_difference/mean": 0.06519916653633118, "step": 41, "step_time": 14.135176596000292 }, { "clip_ratio/high_max": 0.03996024373918772, "clip_ratio/high_mean": 0.012911256635561585, "clip_ratio/low_mean": 0.004817708395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01772896503098309, "completions/clipped_ratio": 0.0, "completions/max_length": 2767.0, "completions/max_terminated_length": 2767.0, "completions/mean_length": 1934.65625, "completions/mean_terminated_length": 1934.65625, "completions/min_length": 1055.0, "completions/min_terminated_length": 1055.0, "entropy": 0.38335342705249786, "epoch": 0.00336, "frac_reward_zero_std": 0.25, "grad_norm": 2.1722676753997803, "kl": 0.13585597835481167, "learning_rate": 5.999999358655157e-06, "loss": -0.2418, "num_tokens": 3156023.0, "reward": 0.3475000262260437, "reward_std": 0.21655070781707764, "rewards/rollout_reward_func/mean": 0.3475000262260437, "rewards/rollout_reward_func/std": 0.3131937086582184, "sampling/importance_sampling_ratio/max": 2.6130497455596924, "sampling/importance_sampling_ratio/mean": 0.8806287050247192, "sampling/importance_sampling_ratio/min": 0.16678351163864136, "sampling/sampling_logp_difference/max": 2.3499860763549805, "sampling/sampling_logp_difference/mean": 0.06342820823192596, "step": 42, "step_time": 13.112628190999885 }, { "clip_ratio/high_max": 0.021321472711861134, "clip_ratio/high_mean": 0.007562511134892702, "clip_ratio/low_mean": 0.0038768798112869263, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011439391179010272, "completions/clipped_ratio": 0.0, "completions/max_length": 2797.0, "completions/max_terminated_length": 2797.0, "completions/mean_length": 1674.3125, "completions/mean_terminated_length": 1674.3125, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.3885280713438988, "epoch": 0.00344, "frac_reward_zero_std": 0.5, "grad_norm": 1.4752204418182373, "kl": 0.036413189955055714, "learning_rate": 5.999999127058423e-06, "loss": 0.0258, "num_tokens": 3221611.0, "reward": 0.6737500429153442, "reward_std": 0.25966876745224, "rewards/rollout_reward_func/mean": 0.6737500429153442, "rewards/rollout_reward_func/std": 0.4556862711906433, "sampling/importance_sampling_ratio/max": 2.9477226734161377, "sampling/importance_sampling_ratio/mean": 1.1396255493164062, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.167872667312622, "sampling/sampling_logp_difference/mean": 0.06658157706260681, "step": 43, "step_time": 12.020586962000152 }, { "clip_ratio/high_max": 0.036011905409395695, "clip_ratio/high_mean": 0.010423430823720992, "clip_ratio/low_mean": 0.0030159883899614215, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013439419795759022, "completions/clipped_ratio": 0.0, "completions/max_length": 2814.0, "completions/max_terminated_length": 2814.0, "completions/mean_length": 2095.75, "completions/mean_terminated_length": 2095.75, "completions/min_length": 1568.0, "completions/min_terminated_length": 1568.0, "entropy": 0.39560940861701965, "epoch": 0.00352, "frac_reward_zero_std": 0.125, "grad_norm": 1.8694807291030884, "kl": 0.1402588039636612, "learning_rate": 5.999998859831431e-06, "loss": -0.1597, "num_tokens": 3301324.0, "reward": 0.40437501668930054, "reward_std": 0.2259407639503479, "rewards/rollout_reward_func/mean": 0.40437501668930054, "rewards/rollout_reward_func/std": 0.35422733426094055, "sampling/importance_sampling_ratio/max": 2.6974401473999023, "sampling/importance_sampling_ratio/mean": 0.8676252365112305, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.911269187927246, "sampling/sampling_logp_difference/mean": 0.08191373944282532, "step": 44, "step_time": 12.868387047999704 }, { "clip_ratio/high_max": 0.0369886364787817, "clip_ratio/high_mean": 0.011032873298972845, "clip_ratio/low_mean": 0.0043535883305594325, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015386461513116956, "completions/clipped_ratio": 0.0, "completions/max_length": 2793.0, "completions/max_terminated_length": 2793.0, "completions/mean_length": 2412.75, "completions/mean_terminated_length": 2412.75, "completions/min_length": 1056.0, "completions/min_terminated_length": 1056.0, "entropy": 0.4342958629131317, "epoch": 0.0036, "frac_reward_zero_std": 0.375, "grad_norm": 1.422937035560608, "kl": 0.11194289568811655, "learning_rate": 5.999998556974188e-06, "loss": -0.1586, "num_tokens": 3392626.0, "reward": 0.35750001668930054, "reward_std": 0.0949999988079071, "rewards/rollout_reward_func/mean": 0.35750001668930054, "rewards/rollout_reward_func/std": 0.260532945394516, "sampling/importance_sampling_ratio/max": 2.1776068210601807, "sampling/importance_sampling_ratio/mean": 0.852668285369873, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.2368037700653076, "sampling/sampling_logp_difference/mean": 0.07167594134807587, "step": 45, "step_time": 13.40532306199998 }, { "clip_ratio/high_max": 0.036038962192833424, "clip_ratio/high_mean": 0.012058520689606667, "clip_ratio/low_mean": 0.0017857142956927419, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013844234868884087, "completions/clipped_ratio": 0.0, "completions/max_length": 2784.0, "completions/max_terminated_length": 2784.0, "completions/mean_length": 2018.40625, "completions/mean_terminated_length": 2018.40625, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.3769753500819206, "epoch": 0.00368, "frac_reward_zero_std": 0.5, "grad_norm": 6.942874908447266, "kl": 0.8322499115020037, "learning_rate": 5.999998218486697e-06, "loss": -0.0692, "num_tokens": 3469989.0, "reward": 0.39250001311302185, "reward_std": 0.14825798571109772, "rewards/rollout_reward_func/mean": 0.39250001311302185, "rewards/rollout_reward_func/std": 0.29918164014816284, "sampling/importance_sampling_ratio/max": 2.446554660797119, "sampling/importance_sampling_ratio/mean": 0.8061342239379883, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.015519142150879, "sampling/sampling_logp_difference/mean": 0.07696790993213654, "step": 46, "step_time": 12.274703390000013 }, { "clip_ratio/high_max": 0.04237867519259453, "clip_ratio/high_mean": 0.01807057624682784, "clip_ratio/low_mean": 0.005178963067010045, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023249539081007242, "completions/clipped_ratio": 0.0, "completions/max_length": 2800.0, "completions/max_terminated_length": 2800.0, "completions/mean_length": 1802.75, "completions/mean_terminated_length": 1802.75, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.38206612318754196, "epoch": 0.00376, "frac_reward_zero_std": 0.375, "grad_norm": 1.6050693988800049, "kl": 0.05531273875385523, "learning_rate": 5.999997844368963e-06, "loss": -0.0113, "num_tokens": 3540097.0, "reward": 0.4990624785423279, "reward_std": 0.28371256589889526, "rewards/rollout_reward_func/mean": 0.4990624785423279, "rewards/rollout_reward_func/std": 0.41065138578414917, "sampling/importance_sampling_ratio/max": 1.9599696397781372, "sampling/importance_sampling_ratio/mean": 0.8884379863739014, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8157303333282471, "sampling/sampling_logp_difference/mean": 0.06130218505859375, "step": 47, "step_time": 12.33328224800016 }, { "clip_ratio/high_max": 0.0206808946095407, "clip_ratio/high_mean": 0.005170223652385175, "clip_ratio/low_mean": 0.004861111170612276, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01003133482299745, "completions/clipped_ratio": 0.0, "completions/max_length": 2813.0, "completions/max_terminated_length": 2813.0, "completions/mean_length": 1976.21875, "completions/mean_terminated_length": 1976.21875, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.37976498901844025, "epoch": 0.00384, "frac_reward_zero_std": 0.5, "grad_norm": 1.5906230211257935, "kl": 0.11688470654189587, "learning_rate": 5.999997434620992e-06, "loss": -0.1357, "num_tokens": 3616089.0, "reward": 0.437812477350235, "reward_std": 0.20705953240394592, "rewards/rollout_reward_func/mean": 0.437812477350235, "rewards/rollout_reward_func/std": 0.35220715403556824, "sampling/importance_sampling_ratio/max": 1.8663876056671143, "sampling/importance_sampling_ratio/mean": 0.8626433610916138, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.65832781791687, "sampling/sampling_logp_difference/mean": 0.06971758604049683, "step": 48, "step_time": 12.541744299999891 }, { "clip_ratio/high_max": 0.012820512987673283, "clip_ratio/high_mean": 0.0032051282469183207, "clip_ratio/low_mean": 0.0014534883666783571, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004658616613596678, "completions/clipped_ratio": 0.0, "completions/max_length": 2807.0, "completions/max_terminated_length": 2807.0, "completions/mean_length": 2245.15625, "completions/mean_terminated_length": 2245.15625, "completions/min_length": 1551.0, "completions/min_terminated_length": 1551.0, "entropy": 0.4284479096531868, "epoch": 0.00392, "frac_reward_zero_std": 0.75, "grad_norm": 1.0771631002426147, "kl": 0.046674114651978016, "learning_rate": 5.999996989242791e-06, "loss": -0.0014, "num_tokens": 3701038.0, "reward": 0.42624998092651367, "reward_std": 0.13466876745224, "rewards/rollout_reward_func/mean": 0.42624998092651367, "rewards/rollout_reward_func/std": 0.3314265012741089, "sampling/importance_sampling_ratio/max": 1.4823979139328003, "sampling/importance_sampling_ratio/mean": 0.8060042858123779, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.11665940284729, "sampling/sampling_logp_difference/mean": 0.0697537213563919, "step": 49, "step_time": 13.132170692999807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2787.0, "completions/max_terminated_length": 2787.0, "completions/mean_length": 2460.71875, "completions/mean_terminated_length": 2460.71875, "completions/min_length": 2034.0, "completions/min_terminated_length": 2034.0, "entropy": 0.4269709587097168, "epoch": 0.004, "frac_reward_zero_std": 1.0, "grad_norm": 0.06742172688245773, "kl": 0.05730041675269604, "learning_rate": 5.999996508234369e-06, "loss": 0.0008, "num_tokens": 3793655.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.30000001192092896, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.4013469219207764, "sampling/importance_sampling_ratio/mean": 0.8128387928009033, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.065826416015625, "sampling/sampling_logp_difference/mean": 0.07448764890432358, "step": 50, "step_time": 13.017520340999681 }, { "clip_ratio/high_max": 0.03630952490493655, "clip_ratio/high_mean": 0.012549603707157075, "clip_ratio/low_mean": 0.0031565657118335366, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015706169069744647, "completions/clipped_ratio": 0.0, "completions/max_length": 2785.0, "completions/max_terminated_length": 2785.0, "completions/mean_length": 1773.78125, "completions/mean_terminated_length": 1773.78125, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.37685880810022354, "epoch": 0.00408, "frac_reward_zero_std": 0.625, "grad_norm": 1.299412727355957, "kl": 0.04002719838172197, "learning_rate": 5.999995991595729e-06, "loss": -0.0109, "num_tokens": 3862448.0, "reward": 0.5353125333786011, "reward_std": 0.08654377609491348, "rewards/rollout_reward_func/mean": 0.5353125333786011, "rewards/rollout_reward_func/std": 0.41608762741088867, "sampling/importance_sampling_ratio/max": 2.3817224502563477, "sampling/importance_sampling_ratio/mean": 0.9811595678329468, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0328466892242432, "sampling/sampling_logp_difference/mean": 0.06913870573043823, "step": 51, "step_time": 12.60499654799969 }, { "clip_ratio/high_max": 0.03819444449618459, "clip_ratio/high_mean": 0.015144050237722695, "clip_ratio/low_mean": 0.00554396363440901, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02068801363930106, "completions/clipped_ratio": 0.0, "completions/max_length": 2784.0, "completions/max_terminated_length": 2784.0, "completions/mean_length": 1786.09375, "completions/mean_terminated_length": 1786.09375, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.38082515448331833, "epoch": 0.00416, "frac_reward_zero_std": 0.375, "grad_norm": 1.5871905088424683, "kl": 0.06744291074573994, "learning_rate": 5.999995439326883e-06, "loss": -0.0699, "num_tokens": 3931876.0, "reward": 0.6090624928474426, "reward_std": 0.26599711179733276, "rewards/rollout_reward_func/mean": 0.6090624928474426, "rewards/rollout_reward_func/std": 0.4591953456401825, "sampling/importance_sampling_ratio/max": 2.734297752380371, "sampling/importance_sampling_ratio/mean": 0.9665597677230835, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.065897226333618, "sampling/sampling_logp_difference/mean": 0.06354629993438721, "step": 52, "step_time": 13.568785395000077 }, { "clip_ratio/high_max": 0.022086466662585735, "clip_ratio/high_mean": 0.008820227812975645, "clip_ratio/low_mean": 0.0057043652050197124, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01452459313441068, "completions/clipped_ratio": 0.0, "completions/max_length": 2789.0, "completions/max_terminated_length": 2789.0, "completions/mean_length": 1635.03125, "completions/mean_terminated_length": 1635.03125, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.36858493834733963, "epoch": 0.00424, "frac_reward_zero_std": 0.375, "grad_norm": 1.9566222429275513, "kl": 0.07718627620488405, "learning_rate": 5.999994851427837e-06, "loss": 0.0822, "num_tokens": 3995868.0, "reward": 0.6918749809265137, "reward_std": 0.3203721046447754, "rewards/rollout_reward_func/mean": 0.6918749809265137, "rewards/rollout_reward_func/std": 0.4697249233722687, "sampling/importance_sampling_ratio/max": 2.7838289737701416, "sampling/importance_sampling_ratio/mean": 0.9136906266212463, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.7872750759124756, "sampling/sampling_logp_difference/mean": 0.07199069857597351, "step": 53, "step_time": 12.247058065999909 }, { "clip_ratio/high_max": 0.041652148589491844, "clip_ratio/high_mean": 0.013425522716715932, "clip_ratio/low_mean": 0.01002952002454549, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234550426248461, "completions/clipped_ratio": 0.0, "completions/max_length": 2784.0, "completions/max_terminated_length": 2784.0, "completions/mean_length": 1590.09375, "completions/mean_terminated_length": 1590.09375, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.37869949638843536, "epoch": 0.00432, "frac_reward_zero_std": 0.125, "grad_norm": 2.3980886936187744, "kl": 0.05773049034178257, "learning_rate": 5.999994227898604e-06, "loss": -0.0192, "num_tokens": 4058303.0, "reward": 0.4609374701976776, "reward_std": 0.35279375314712524, "rewards/rollout_reward_func/mean": 0.4609374701976776, "rewards/rollout_reward_func/std": 0.44058871269226074, "sampling/importance_sampling_ratio/max": 2.2311129570007324, "sampling/importance_sampling_ratio/mean": 0.9393452405929565, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9994411468505859, "sampling/sampling_logp_difference/mean": 0.08165294677019119, "step": 54, "step_time": 11.528886767999893 }, { "clip_ratio/high_max": 0.02447916753590107, "clip_ratio/high_mean": 0.0075732802506536245, "clip_ratio/low_mean": 0.00947712454944849, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01705040503293276, "completions/clipped_ratio": 0.0, "completions/max_length": 2444.0, "completions/max_terminated_length": 2444.0, "completions/mean_length": 1789.75, "completions/mean_terminated_length": 1789.75, "completions/min_length": 1055.0, "completions/min_terminated_length": 1055.0, "entropy": 0.36519913375377655, "epoch": 0.0044, "frac_reward_zero_std": 0.375, "grad_norm": 2.2972187995910645, "kl": 0.05506392475217581, "learning_rate": 5.99999356873919e-06, "loss": -0.1185, "num_tokens": 4127411.0, "reward": 0.40562498569488525, "reward_std": 0.22391541302204132, "rewards/rollout_reward_func/mean": 0.40562498569488525, "rewards/rollout_reward_func/std": 0.3422500193119049, "sampling/importance_sampling_ratio/max": 2.4115021228790283, "sampling/importance_sampling_ratio/mean": 0.9461013674736023, "sampling/importance_sampling_ratio/min": 0.14957794547080994, "sampling/sampling_logp_difference/max": 1.0122857093811035, "sampling/sampling_logp_difference/mean": 0.06259442120790482, "step": 55, "step_time": 11.493340414999693 }, { "clip_ratio/high_max": 0.0438775522634387, "clip_ratio/high_mean": 0.012457483448088169, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012457483448088169, "completions/clipped_ratio": 0.0, "completions/max_length": 2799.0, "completions/max_terminated_length": 2799.0, "completions/mean_length": 2306.375, "completions/mean_terminated_length": 2306.375, "completions/min_length": 1567.0, "completions/min_terminated_length": 1567.0, "entropy": 0.40949854254722595, "epoch": 0.00448, "frac_reward_zero_std": 0.375, "grad_norm": 2.023374319076538, "kl": 0.08688413165509701, "learning_rate": 5.999992873949609e-06, "loss": -0.0712, "num_tokens": 4214487.0, "reward": 0.296875, "reward_std": 0.08874999731779099, "rewards/rollout_reward_func/mean": 0.296875, "rewards/rollout_reward_func/std": 0.15228237211704254, "sampling/importance_sampling_ratio/max": 2.9896316528320312, "sampling/importance_sampling_ratio/mean": 0.968756377696991, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.835113763809204, "sampling/sampling_logp_difference/mean": 0.08090537041425705, "step": 56, "step_time": 13.222404719000224 }, { "clip_ratio/high_max": 0.04506416339427233, "clip_ratio/high_mean": 0.01424223161302507, "clip_ratio/low_mean": 0.002842377289198339, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01708460901863873, "completions/clipped_ratio": 0.0, "completions/max_length": 2441.0, "completions/max_terminated_length": 2441.0, "completions/mean_length": 1963.0, "completions/mean_terminated_length": 1963.0, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.4080217182636261, "epoch": 0.00456, "frac_reward_zero_std": 0.375, "grad_norm": 1.663516640663147, "kl": 0.3106076046824455, "learning_rate": 5.999992143529868e-06, "loss": -0.0796, "num_tokens": 4289619.0, "reward": 0.3934375047683716, "reward_std": 0.1563829779624939, "rewards/rollout_reward_func/mean": 0.3934375047683716, "rewards/rollout_reward_func/std": 0.30592650175094604, "sampling/importance_sampling_ratio/max": 1.4833005666732788, "sampling/importance_sampling_ratio/mean": 0.5795140862464905, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.2869999408721924, "sampling/sampling_logp_difference/mean": 0.0979442298412323, "step": 57, "step_time": 11.762371722000125 }, { "clip_ratio/high_max": 0.046875, "clip_ratio/high_mean": 0.01171875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "completions/clipped_ratio": 0.0, "completions/max_length": 2802.0, "completions/max_terminated_length": 2802.0, "completions/mean_length": 1879.75, "completions/mean_terminated_length": 1879.75, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.3892976716160774, "epoch": 0.00464, "frac_reward_zero_std": 0.625, "grad_norm": 1.1009422540664673, "kl": 0.05097049381583929, "learning_rate": 5.999991377479982e-06, "loss": -0.0191, "num_tokens": 4362090.0, "reward": 0.5262500047683716, "reward_std": 0.1875, "rewards/rollout_reward_func/mean": 0.5262500047683716, "rewards/rollout_reward_func/std": 0.4009806215763092, "sampling/importance_sampling_ratio/max": 2.9964590072631836, "sampling/importance_sampling_ratio/mean": 1.0218505859375, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0378296375274658, "sampling/sampling_logp_difference/mean": 0.06926104426383972, "step": 58, "step_time": 13.15352440300012 }, { "clip_ratio/high_max": 0.019571688026189804, "clip_ratio/high_mean": 0.004892922006547451, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004892922006547451, "completions/clipped_ratio": 0.0, "completions/max_length": 2832.0, "completions/max_terminated_length": 2832.0, "completions/mean_length": 2202.375, "completions/mean_terminated_length": 2202.375, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.41450754553079605, "epoch": 0.00472, "frac_reward_zero_std": 0.625, "grad_norm": 2.1054370403289795, "kl": 0.03926007356494665, "learning_rate": 5.999990575799961e-06, "loss": 0.0595, "num_tokens": 4446012.0, "reward": 0.44343751668930054, "reward_std": 0.13312500715255737, "rewards/rollout_reward_func/mean": 0.44343751668930054, "rewards/rollout_reward_func/std": 0.3428213894367218, "sampling/importance_sampling_ratio/max": 2.236393928527832, "sampling/importance_sampling_ratio/mean": 0.8590089678764343, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9236248731613159, "sampling/sampling_logp_difference/mean": 0.06904841959476471, "step": 59, "step_time": 13.58062329900008 }, { "clip_ratio/high_max": 0.029240576550364494, "clip_ratio/high_mean": 0.007310144137591124, "clip_ratio/low_mean": 0.0030868902103975415, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010397034231573343, "completions/clipped_ratio": 0.0, "completions/max_length": 2429.0, "completions/max_terminated_length": 2429.0, "completions/mean_length": 2124.9375, "completions/mean_terminated_length": 2124.9375, "completions/min_length": 1567.0, "completions/min_terminated_length": 1567.0, "entropy": 0.370839923620224, "epoch": 0.0048, "frac_reward_zero_std": 0.75, "grad_norm": 0.5435988903045654, "kl": 0.08518982026726007, "learning_rate": 5.99998973848982e-06, "loss": -0.0579, "num_tokens": 4527051.0, "reward": 0.3590624928474426, "reward_std": 0.06796419620513916, "rewards/rollout_reward_func/mean": 0.3590624928474426, "rewards/rollout_reward_func/std": 0.22809672355651855, "sampling/importance_sampling_ratio/max": 2.2381787300109863, "sampling/importance_sampling_ratio/mean": 0.8449472188949585, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.6413207054138184, "sampling/sampling_logp_difference/mean": 0.069917693734169, "step": 60, "step_time": 11.73221161399988 }, { "clip_ratio/high_max": 0.02281746082007885, "clip_ratio/high_mean": 0.006954365293495357, "clip_ratio/low_mean": 0.0037499999161809683, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010704364976845682, "completions/clipped_ratio": 0.0, "completions/max_length": 2767.0, "completions/max_terminated_length": 2767.0, "completions/mean_length": 1689.3125, "completions/mean_terminated_length": 1689.3125, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.3677019253373146, "epoch": 0.00488, "frac_reward_zero_std": 0.375, "grad_norm": 1.9829214811325073, "kl": 0.058779667131602764, "learning_rate": 5.999988865549569e-06, "loss": 0.0304, "num_tokens": 4593095.0, "reward": 0.6549999713897705, "reward_std": 0.22813192009925842, "rewards/rollout_reward_func/mean": 0.6549999713897705, "rewards/rollout_reward_func/std": 0.45271220803260803, "sampling/importance_sampling_ratio/max": 1.883159875869751, "sampling/importance_sampling_ratio/mean": 0.8463116884231567, "sampling/importance_sampling_ratio/min": 0.22824469208717346, "sampling/sampling_logp_difference/max": 1.781625747680664, "sampling/sampling_logp_difference/mean": 0.06828776746988297, "step": 61, "step_time": 12.480462479000153 }, { "clip_ratio/high_max": 0.0110975606366992, "clip_ratio/high_mean": 0.0027743901591748, "clip_ratio/low_mean": 0.0014880952658131719, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004262485424987972, "completions/clipped_ratio": 0.0, "completions/max_length": 2815.0, "completions/max_terminated_length": 2815.0, "completions/mean_length": 2305.15625, "completions/mean_terminated_length": 2305.15625, "completions/min_length": 1571.0, "completions/min_terminated_length": 1571.0, "entropy": 0.4012472406029701, "epoch": 0.00496, "frac_reward_zero_std": 0.75, "grad_norm": 0.8583229780197144, "kl": 0.06238031107932329, "learning_rate": 5.999987956979225e-06, "loss": -0.0392, "num_tokens": 4680377.0, "reward": 0.3434374928474426, "reward_std": 0.08029377460479736, "rewards/rollout_reward_func/mean": 0.3434374928474426, "rewards/rollout_reward_func/std": 0.22245851159095764, "sampling/importance_sampling_ratio/max": 2.7665059566497803, "sampling/importance_sampling_ratio/mean": 0.9882571697235107, "sampling/importance_sampling_ratio/min": 0.059778764843940735, "sampling/sampling_logp_difference/max": 0.9543299674987793, "sampling/sampling_logp_difference/mean": 0.0674777626991272, "step": 62, "step_time": 14.56621960199982 }, { "clip_ratio/high_max": 0.05253623379394412, "clip_ratio/high_mean": 0.016606280929408967, "clip_ratio/low_mean": 0.0018382353009656072, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018444516230374575, "completions/clipped_ratio": 0.0, "completions/max_length": 2423.0, "completions/max_terminated_length": 2423.0, "completions/mean_length": 1968.9375, "completions/mean_terminated_length": 1968.9375, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.4196172505617142, "epoch": 0.00504, "frac_reward_zero_std": 0.5, "grad_norm": 0.905213475227356, "kl": 0.09828684013336897, "learning_rate": 5.999987012778799e-06, "loss": -0.0034, "num_tokens": 4755993.0, "reward": 0.33031249046325684, "reward_std": 0.09654378145933151, "rewards/rollout_reward_func/mean": 0.33031249046325684, "rewards/rollout_reward_func/std": 0.21877197921276093, "sampling/importance_sampling_ratio/max": 2.0344085693359375, "sampling/importance_sampling_ratio/mean": 0.7417819499969482, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.208054542541504, "sampling/sampling_logp_difference/mean": 0.07868118584156036, "step": 63, "step_time": 11.736785162999922 }, { "clip_ratio/high_max": 0.0055555556900799274, "clip_ratio/high_mean": 0.0027777778450399637, "clip_ratio/low_mean": 0.003794643096625805, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006572420941665769, "completions/clipped_ratio": 0.0, "completions/max_length": 2814.0, "completions/max_terminated_length": 2814.0, "completions/mean_length": 1972.9375, "completions/mean_terminated_length": 1972.9375, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.4023704081773758, "epoch": 0.00512, "frac_reward_zero_std": 0.5, "grad_norm": 1.3090136051177979, "kl": 0.12237261980772018, "learning_rate": 5.9999860329483104e-06, "loss": -0.194, "num_tokens": 4831827.0, "reward": 0.5878125429153442, "reward_std": 0.142506942152977, "rewards/rollout_reward_func/mean": 0.5878125429153442, "rewards/rollout_reward_func/std": 0.4488244950771332, "sampling/importance_sampling_ratio/max": 2.9826838970184326, "sampling/importance_sampling_ratio/mean": 0.9495848417282104, "sampling/importance_sampling_ratio/min": 0.01580546610057354, "sampling/sampling_logp_difference/max": 2.0490379333496094, "sampling/sampling_logp_difference/mean": 0.07465916872024536, "step": 64, "step_time": 12.846850890000042 }, { "clip_ratio/high_max": 0.021152781788259745, "clip_ratio/high_mean": 0.006812585634179413, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006812585634179413, "completions/clipped_ratio": 0.0, "completions/max_length": 2800.0, "completions/max_terminated_length": 2800.0, "completions/mean_length": 2005.0, "completions/mean_terminated_length": 2005.0, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.40342626720666885, "epoch": 0.0052, "frac_reward_zero_std": 0.625, "grad_norm": 1.2556148767471313, "kl": 0.0746797863394022, "learning_rate": 5.999985017487771e-06, "loss": -0.0305, "num_tokens": 4908716.0, "reward": 0.3818749785423279, "reward_std": 0.15371949970722198, "rewards/rollout_reward_func/mean": 0.3818749785423279, "rewards/rollout_reward_func/std": 0.3037022650241852, "sampling/importance_sampling_ratio/max": 2.2475836277008057, "sampling/importance_sampling_ratio/mean": 0.8594139814376831, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.4058151245117188, "sampling/sampling_logp_difference/mean": 0.06716296076774597, "step": 65, "step_time": 12.423915739000222 }, { "clip_ratio/high_max": 0.03018707549199462, "clip_ratio/high_mean": 0.007546768872998655, "clip_ratio/low_mean": 0.006225198740139604, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013771967613138258, "completions/clipped_ratio": 0.0, "completions/max_length": 2792.0, "completions/max_terminated_length": 2792.0, "completions/mean_length": 2048.40625, "completions/mean_terminated_length": 2048.40625, "completions/min_length": 1055.0, "completions/min_terminated_length": 1055.0, "entropy": 0.38859760761260986, "epoch": 0.00528, "frac_reward_zero_std": 0.375, "grad_norm": 2.1451351642608643, "kl": 0.22367357090115547, "learning_rate": 5.999983966397197e-06, "loss": -0.1677, "num_tokens": 4987207.0, "reward": 0.4712499976158142, "reward_std": 0.18434235453605652, "rewards/rollout_reward_func/mean": 0.4712499976158142, "rewards/rollout_reward_func/std": 0.35322248935699463, "sampling/importance_sampling_ratio/max": 2.9422554969787598, "sampling/importance_sampling_ratio/mean": 0.9915428161621094, "sampling/importance_sampling_ratio/min": 0.016011416912078857, "sampling/sampling_logp_difference/max": 2.497363805770874, "sampling/sampling_logp_difference/mean": 0.07474374771118164, "step": 66, "step_time": 12.926716641999974 }, { "clip_ratio/high_max": 0.01376319769769907, "clip_ratio/high_mean": 0.005043363547883928, "clip_ratio/low_mean": 0.0014880952658131719, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0065314588136971, "completions/clipped_ratio": 0.0, "completions/max_length": 2783.0, "completions/max_terminated_length": 2783.0, "completions/mean_length": 1772.40625, "completions/mean_terminated_length": 1772.40625, "completions/min_length": 1055.0, "completions/min_terminated_length": 1055.0, "entropy": 0.4034885838627815, "epoch": 0.00536, "frac_reward_zero_std": 0.625, "grad_norm": 1.5488033294677734, "kl": 0.05104802828282118, "learning_rate": 5.999982879676608e-06, "loss": -0.041, "num_tokens": 5055760.0, "reward": 0.5737500190734863, "reward_std": 0.16325795650482178, "rewards/rollout_reward_func/mean": 0.5737500190734863, "rewards/rollout_reward_func/std": 0.40885162353515625, "sampling/importance_sampling_ratio/max": 2.278799057006836, "sampling/importance_sampling_ratio/mean": 0.99635910987854, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8835396766662598, "sampling/sampling_logp_difference/mean": 0.06713330745697021, "step": 67, "step_time": 12.46692701400002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004360465100035071, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004360465100035071, "completions/clipped_ratio": 0.0, "completions/max_length": 2796.0, "completions/max_terminated_length": 2796.0, "completions/mean_length": 2270.40625, "completions/mean_terminated_length": 2270.40625, "completions/min_length": 1983.0, "completions/min_terminated_length": 1983.0, "entropy": 0.4274456053972244, "epoch": 0.00544, "frac_reward_zero_std": 0.75, "grad_norm": 0.8523308038711548, "kl": 0.13869191519916058, "learning_rate": 5.9999817573260195e-06, "loss": -0.1124, "num_tokens": 5141713.0, "reward": 0.2878125011920929, "reward_std": 0.01750694215297699, "rewards/rollout_reward_func/mean": 0.2878125011920929, "rewards/rollout_reward_func/std": 0.03849880024790764, "sampling/importance_sampling_ratio/max": 2.765258550643921, "sampling/importance_sampling_ratio/mean": 0.8237208127975464, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.7204997539520264, "sampling/sampling_logp_difference/mean": 0.08386299759149551, "step": 68, "step_time": 12.989918530000296 }, { "clip_ratio/high_max": 0.04849738674238324, "clip_ratio/high_mean": 0.016489425906911492, "clip_ratio/low_mean": 0.0013888889225199819, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017878314713016152, "completions/clipped_ratio": 0.0, "completions/max_length": 2441.0, "completions/max_terminated_length": 2441.0, "completions/mean_length": 1876.46875, "completions/mean_terminated_length": 1876.46875, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.39091238379478455, "epoch": 0.00552, "frac_reward_zero_std": 0.625, "grad_norm": 1.4318950176239014, "kl": 0.09389345720410347, "learning_rate": 5.999980599345448e-06, "loss": -0.0356, "num_tokens": 5214177.0, "reward": 0.5806249976158142, "reward_std": 0.07874999940395355, "rewards/rollout_reward_func/mean": 0.5806249976158142, "rewards/rollout_reward_func/std": 0.4241190552711487, "sampling/importance_sampling_ratio/max": 1.9308501482009888, "sampling/importance_sampling_ratio/mean": 0.9257422089576721, "sampling/importance_sampling_ratio/min": 0.21397040784358978, "sampling/sampling_logp_difference/max": 1.7655794620513916, "sampling/sampling_logp_difference/mean": 0.06729073822498322, "step": 69, "step_time": 11.706464537000102 }, { "clip_ratio/high_max": 0.012202381156384945, "clip_ratio/high_mean": 0.0030505952890962362, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030505952890962362, "completions/clipped_ratio": 0.0, "completions/max_length": 2429.0, "completions/max_terminated_length": 2429.0, "completions/mean_length": 1901.125, "completions/mean_terminated_length": 1901.125, "completions/min_length": 1056.0, "completions/min_terminated_length": 1056.0, "entropy": 0.4174434766173363, "epoch": 0.0056, "frac_reward_zero_std": 0.625, "grad_norm": 1.2482589483261108, "kl": 0.0709009887650609, "learning_rate": 5.999979405734914e-06, "loss": -0.0875, "num_tokens": 5287259.0, "reward": 0.44999998807907104, "reward_std": 0.19828803837299347, "rewards/rollout_reward_func/mean": 0.44999998807907104, "rewards/rollout_reward_func/std": 0.36232221126556396, "sampling/importance_sampling_ratio/max": 2.240993022918701, "sampling/importance_sampling_ratio/mean": 0.7815386652946472, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.9302306175231934, "sampling/sampling_logp_difference/mean": 0.07109043747186661, "step": 70, "step_time": 11.92579563199979 }, { "clip_ratio/high_max": 0.03573596617206931, "clip_ratio/high_mean": 0.011878836317919195, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01361494732555002, "completions/clipped_ratio": 0.0, "completions/max_length": 2800.0, "completions/max_terminated_length": 2800.0, "completions/mean_length": 2164.34375, "completions/mean_terminated_length": 2164.34375, "completions/min_length": 1055.0, "completions/min_terminated_length": 1055.0, "entropy": 0.4176176115870476, "epoch": 0.00568, "frac_reward_zero_std": 0.375, "grad_norm": 1.604291319847107, "kl": 0.0694936579093337, "learning_rate": 5.999978176494435e-06, "loss": -0.1233, "num_tokens": 5369772.0, "reward": 0.4762499928474426, "reward_std": 0.21075797080993652, "rewards/rollout_reward_func/mean": 0.4762499928474426, "rewards/rollout_reward_func/std": 0.3796156644821167, "sampling/importance_sampling_ratio/max": 2.497992992401123, "sampling/importance_sampling_ratio/mean": 0.8422503471374512, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.5480012893676758, "sampling/sampling_logp_difference/mean": 0.0728713721036911, "step": 71, "step_time": 12.894382659999792 }, { "clip_ratio/high_max": 0.02281746082007885, "clip_ratio/high_mean": 0.008831319864839315, "clip_ratio/low_mean": 0.003260501311160624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012091821059584618, "completions/clipped_ratio": 0.0, "completions/max_length": 2443.0, "completions/max_terminated_length": 2443.0, "completions/mean_length": 1719.03125, "completions/mean_terminated_length": 1719.03125, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.39671653509140015, "epoch": 0.00576, "frac_reward_zero_std": 0.5, "grad_norm": 1.5779305696487427, "kl": 0.08434087503701448, "learning_rate": 5.99997691162403e-06, "loss": -0.0636, "num_tokens": 5436596.0, "reward": 0.6024999618530273, "reward_std": 0.2729267477989197, "rewards/rollout_reward_func/mean": 0.6024999618530273, "rewards/rollout_reward_func/std": 0.4505694806575775, "sampling/importance_sampling_ratio/max": 2.754258632659912, "sampling/importance_sampling_ratio/mean": 0.9120872020721436, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.2350893020629883, "sampling/sampling_logp_difference/mean": 0.07831829786300659, "step": 72, "step_time": 12.011820702999785 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005208333372138441, "completions/clipped_ratio": 0.0, "completions/max_length": 2780.0, "completions/max_terminated_length": 2780.0, "completions/mean_length": 1720.125, "completions/mean_terminated_length": 1720.125, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.35359790176153183, "epoch": 0.00584, "frac_reward_zero_std": 0.375, "grad_norm": 1.760622262954712, "kl": 0.0876467265188694, "learning_rate": 5.99997561112372e-06, "loss": -0.0248, "num_tokens": 5503333.0, "reward": 0.7487499713897705, "reward_std": 0.2987908720970154, "rewards/rollout_reward_func/mean": 0.7487499713897705, "rewards/rollout_reward_func/std": 0.46135663986206055, "sampling/importance_sampling_ratio/max": 2.7877893447875977, "sampling/importance_sampling_ratio/mean": 0.8998199701309204, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.3734312057495117, "sampling/sampling_logp_difference/mean": 0.07869358360767365, "step": 73, "step_time": 13.119324159999906 }, { "clip_ratio/high_max": 0.013701201416552067, "clip_ratio/high_mean": 0.005070037324912846, "clip_ratio/low_mean": 0.0030487803742289543, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008118817582726479, "completions/clipped_ratio": 0.0, "completions/max_length": 2825.0, "completions/max_terminated_length": 2825.0, "completions/mean_length": 1916.5, "completions/mean_terminated_length": 1916.5, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.38354693353176117, "epoch": 0.00592, "frac_reward_zero_std": 0.625, "grad_norm": 0.8614935874938965, "kl": 0.05992862023413181, "learning_rate": 5.999974274993527e-06, "loss": 0.012, "num_tokens": 5576952.0, "reward": 0.5674999952316284, "reward_std": 0.20719751715660095, "rewards/rollout_reward_func/mean": 0.5674999952316284, "rewards/rollout_reward_func/std": 0.4317331612110138, "sampling/importance_sampling_ratio/max": 1.6715911626815796, "sampling/importance_sampling_ratio/mean": 0.8377959132194519, "sampling/importance_sampling_ratio/min": 0.196980819106102, "sampling/sampling_logp_difference/max": 0.913780689239502, "sampling/sampling_logp_difference/mean": 0.06950188428163528, "step": 74, "step_time": 12.793109332000085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2785.0, "completions/max_terminated_length": 2785.0, "completions/mean_length": 1587.53125, "completions/mean_terminated_length": 1587.53125, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.35842984169721603, "epoch": 0.006, "frac_reward_zero_std": 0.375, "grad_norm": 1.3697266578674316, "kl": 0.05413582641631365, "learning_rate": 5.99997290323347e-06, "loss": -0.0661, "num_tokens": 5639305.0, "reward": 0.7106249928474426, "reward_std": 0.30803900957107544, "rewards/rollout_reward_func/mean": 0.7106249928474426, "rewards/rollout_reward_func/std": 0.4498884081840515, "sampling/importance_sampling_ratio/max": 1.9579815864562988, "sampling/importance_sampling_ratio/mean": 0.8830677270889282, "sampling/importance_sampling_ratio/min": 0.15943297743797302, "sampling/sampling_logp_difference/max": 1.0672590732574463, "sampling/sampling_logp_difference/mean": 0.06444922834634781, "step": 75, "step_time": 12.032428736000156 }, { "epoch": 0.006, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 2488.6, "eval_completions/max_terminated_length": 2488.6, "eval_completions/mean_length": 1951.175, "eval_completions/mean_terminated_length": 1951.175, "eval_completions/min_length": 1361.4, "eval_completions/min_terminated_length": 1361.4, "eval_entropy": 0.37236364781856535, "eval_frac_reward_zero_std": 0.1, "eval_kl": 0.07629953697323799, "eval_loss": -0.0013724860036745667, "eval_num_tokens": 5639305.0, "eval_reward": 0.47524999976158144, "eval_reward_std": 0.37039353847503664, "eval_rewards/rollout_reward_func/mean": 0.47524999976158144, "eval_rewards/rollout_reward_func/std": 0.37039353176951406, "eval_runtime": 10.5117, "eval_samples_per_second": 0.951, "eval_sampling/importance_sampling_ratio/max": 1.6587244033813477, "eval_sampling/importance_sampling_ratio/mean": 0.8837172389030457, "eval_sampling/importance_sampling_ratio/min": 0.32487900257110597, "eval_sampling/sampling_logp_difference/max": 0.8625046908855438, "eval_sampling/sampling_logp_difference/mean": 0.06909476891160012, "eval_steps_per_second": 0.285, "step": 75 } ], "logging_steps": 1.0, "max_steps": 25000, "num_input_tokens_seen": 5639305, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }