diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,2542 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.006, + "eval_steps": 500, + "global_step": 75, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2783.0, + "completions/max_terminated_length": 2783.0, + "completions/mean_length": 2052.75, + "completions/mean_terminated_length": 2052.75, + "completions/min_length": 1055.0, + "completions/min_terminated_length": 1055.0, + "entropy": 0.4163087382912636, + "epoch": 8e-05, + "frac_reward_zero_std": 0.375, + "grad_norm": 1.8663769960403442, + "kl": 0.0, + "learning_rate": 0.0, + "loss": -0.0386, + "num_tokens": 78630.0, + "reward": 0.46406251192092896, + "reward_std": 0.20054946839809418, + "rewards/rollout_reward_func/mean": 0.46406251192092896, + "rewards/rollout_reward_func/std": 0.37604784965515137, + "sampling/importance_sampling_ratio/max": 2.1498024463653564, + "sampling/importance_sampling_ratio/mean": 1.0975958108901978, + "sampling/importance_sampling_ratio/min": 0.241215318441391, + "sampling/sampling_logp_difference/max": 0.7405228614807129, + "sampling/sampling_logp_difference/mean": 0.039819031953811646, + "step": 1, + "step_time": 14.418279634999976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2798.0, + "completions/max_terminated_length": 2798.0, + "completions/mean_length": 2084.21875, + "completions/mean_terminated_length": 2084.21875, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.3995310440659523, + "epoch": 0.00016, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.142817735671997, + "kl": 0.0, + "learning_rate": 1.7142857142857143e-07, + "loss": 0.016, + "num_tokens": 158194.0, + "reward": 0.3384375274181366, + "reward_std": 0.16842570900917053, + "rewards/rollout_reward_func/mean": 0.3384375274181366, + "rewards/rollout_reward_func/std": 0.27340278029441833, + "sampling/importance_sampling_ratio/max": 1.9602876901626587, + "sampling/importance_sampling_ratio/mean": 0.992855966091156, + "sampling/importance_sampling_ratio/min": 0.46628525853157043, + "sampling/sampling_logp_difference/max": 0.6929764747619629, + "sampling/sampling_logp_difference/mean": 0.04201715067028999, + "step": 2, + "step_time": 13.260429134000105 + }, + { + "clip_ratio/high_max": 0.04315628902986646, + "clip_ratio/high_mean": 0.012242560740560293, + "clip_ratio/low_mean": 0.011964043835178018, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.024206604342907667, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2776.0, + "completions/max_terminated_length": 2776.0, + "completions/mean_length": 1875.09375, + "completions/mean_terminated_length": 1875.09375, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.38934508711099625, + "epoch": 0.00024, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.2311601638793945, + "kl": 0.003617420152295381, + "learning_rate": 3.4285714285714286e-07, + "loss": -0.0954, + "num_tokens": 230320.0, + "reward": 0.4612500071525574, + "reward_std": 0.22380851209163666, + "rewards/rollout_reward_func/mean": 0.4612500071525574, + "rewards/rollout_reward_func/std": 0.3984546363353729, + "sampling/importance_sampling_ratio/max": 1.6067352294921875, + "sampling/importance_sampling_ratio/mean": 0.9242645502090454, + "sampling/importance_sampling_ratio/min": 0.17279618978500366, + "sampling/sampling_logp_difference/max": 1.4119317531585693, + "sampling/sampling_logp_difference/mean": 0.045969706028699875, + "step": 3, + "step_time": 12.304947445999915 + }, + { + "clip_ratio/high_max": 0.023281023371964693, + "clip_ratio/high_mean": 0.012716594734229147, + "clip_ratio/low_mean": 0.01039634458720684, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.023112939670681953, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2783.0, + "completions/max_terminated_length": 2783.0, + "completions/mean_length": 2251.84375, + "completions/mean_terminated_length": 2251.84375, + "completions/min_length": 1562.0, + "completions/min_terminated_length": 1562.0, + "entropy": 0.4188930094242096, + "epoch": 0.00032, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.3432626724243164, + "kl": 0.005323103512637317, + "learning_rate": 5.142857142857143e-07, + "loss": -0.1037, + "num_tokens": 315875.0, + "reward": 0.2640625238418579, + "reward_std": 0.07438889145851135, + "rewards/rollout_reward_func/mean": 0.2640625238418579, + "rewards/rollout_reward_func/std": 0.09810657054185867, + "sampling/importance_sampling_ratio/max": 2.92923903465271, + "sampling/importance_sampling_ratio/mean": 1.0071074962615967, + "sampling/importance_sampling_ratio/min": 0.30356213450431824, + "sampling/sampling_logp_difference/max": 0.9253432750701904, + "sampling/sampling_logp_difference/mean": 0.04933081567287445, + "step": 4, + "step_time": 13.287707804000092 + }, + { + "clip_ratio/high_max": 0.04239537985995412, + "clip_ratio/high_mean": 0.018673060229048133, + "clip_ratio/low_mean": 0.0042297979816794395, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.022902858443558216, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2790.0, + "completions/max_terminated_length": 2790.0, + "completions/mean_length": 2197.3125, + "completions/mean_terminated_length": 2197.3125, + "completions/min_length": 1559.0, + "completions/min_terminated_length": 1559.0, + "entropy": 0.4414307102560997, + "epoch": 0.0004, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.470518112182617, + "kl": 0.004553150560241193, + "learning_rate": 6.857142857142857e-07, + "loss": 0.1372, + "num_tokens": 399370.0, + "reward": 0.40281248092651367, + "reward_std": 0.16662904620170593, + "rewards/rollout_reward_func/mean": 0.40281248092651367, + "rewards/rollout_reward_func/std": 0.3357921242713928, + "sampling/importance_sampling_ratio/max": 2.2576870918273926, + "sampling/importance_sampling_ratio/mean": 1.0002690553665161, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.462371826171875, + "sampling/sampling_logp_difference/mean": 0.053694289177656174, + "step": 5, + "step_time": 13.068715858000132 + }, + { + "clip_ratio/high_max": 0.02923969691619277, + "clip_ratio/high_mean": 0.01021690119523555, + "clip_ratio/low_mean": 0.01101089478470385, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.021227796096354723, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2428.0, + "completions/max_terminated_length": 2428.0, + "completions/mean_length": 1826.09375, + "completions/mean_terminated_length": 1826.09375, + "completions/min_length": 1053.0, + "completions/min_terminated_length": 1053.0, + "entropy": 0.37763065844774246, + "epoch": 0.00048, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4737789630889893, + "kl": 0.003610707528423518, + "learning_rate": 8.571428571428571e-07, + "loss": 0.0212, + "num_tokens": 469858.0, + "reward": 0.4584375023841858, + "reward_std": 0.2892817258834839, + "rewards/rollout_reward_func/mean": 0.4584375023841858, + "rewards/rollout_reward_func/std": 0.4035496413707733, + "sampling/importance_sampling_ratio/max": 1.8672934770584106, + "sampling/importance_sampling_ratio/mean": 0.9250987768173218, + "sampling/importance_sampling_ratio/min": 0.2111542820930481, + "sampling/sampling_logp_difference/max": 1.105020523071289, + "sampling/sampling_logp_difference/mean": 0.04392547905445099, + "step": 6, + "step_time": 11.808095918000163 + }, + { + "clip_ratio/high_max": 0.02163859363645315, + "clip_ratio/high_mean": 0.007195362821221352, + "clip_ratio/low_mean": 0.009288194705732167, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01648355764336884, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2799.0, + "completions/max_terminated_length": 2799.0, + "completions/mean_length": 2101.9375, + "completions/mean_terminated_length": 2101.9375, + "completions/min_length": 1053.0, + "completions/min_terminated_length": 1053.0, + "entropy": 0.4031049609184265, + "epoch": 0.00056, + "frac_reward_zero_std": 0.375, + "grad_norm": 2.241011142730713, + "kl": 0.004900285159237683, + "learning_rate": 1.0285714285714286e-06, + "loss": 0.0307, + "num_tokens": 549695.0, + "reward": 0.32218751311302185, + "reward_std": 0.10592572391033173, + "rewards/rollout_reward_func/mean": 0.32218751311302185, + "rewards/rollout_reward_func/std": 0.22224271297454834, + "sampling/importance_sampling_ratio/max": 2.7520875930786133, + "sampling/importance_sampling_ratio/mean": 0.9687752723693848, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.8389774560928345, + "sampling/sampling_logp_difference/mean": 0.043909620493650436, + "step": 7, + "step_time": 13.085814365000147 + }, + { + "clip_ratio/high_max": 0.029183519072830677, + "clip_ratio/high_mean": 0.008625667076557875, + "clip_ratio/low_mean": 0.016130636679008603, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02475630398839712, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2794.0, + "completions/max_terminated_length": 2794.0, + "completions/mean_length": 2250.71875, + "completions/mean_terminated_length": 2250.71875, + "completions/min_length": 1570.0, + "completions/min_terminated_length": 1570.0, + "entropy": 0.43513813614845276, + "epoch": 0.00064, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.6571757793426514, + "kl": 0.0038885354879312217, + "learning_rate": 1.2000000000000002e-06, + "loss": -0.0896, + "num_tokens": 634822.0, + "reward": 0.30375000834465027, + "reward_std": 0.11063194274902344, + "rewards/rollout_reward_func/mean": 0.30375000834465027, + "rewards/rollout_reward_func/std": 0.22577106952667236, + "sampling/importance_sampling_ratio/max": 2.24173641204834, + "sampling/importance_sampling_ratio/mean": 0.9777867794036865, + "sampling/importance_sampling_ratio/min": 0.4010058343410492, + "sampling/sampling_logp_difference/max": 0.9179394245147705, + "sampling/sampling_logp_difference/mean": 0.0499531514942646, + "step": 8, + "step_time": 13.055169309000007 + }, + { + "clip_ratio/high_max": 0.014742525294423103, + "clip_ratio/high_mean": 0.003685631323605776, + "clip_ratio/low_mean": 0.008176195668056607, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.011861827224493027, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2810.0, + "completions/max_terminated_length": 2810.0, + "completions/mean_length": 1657.03125, + "completions/mean_terminated_length": 1657.03125, + "completions/min_length": 1053.0, + "completions/min_terminated_length": 1053.0, + "entropy": 0.39970337599515915, + "epoch": 0.00072, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.901354432106018, + "kl": 0.005712996702641249, + "learning_rate": 1.3714285714285715e-06, + "loss": 0.0004, + "num_tokens": 699616.0, + "reward": 0.3528124690055847, + "reward_std": 0.20240315794944763, + "rewards/rollout_reward_func/mean": 0.3528124690055847, + "rewards/rollout_reward_func/std": 0.3597510755062103, + "sampling/importance_sampling_ratio/max": 2.387613296508789, + "sampling/importance_sampling_ratio/mean": 1.0771517753601074, + "sampling/importance_sampling_ratio/min": 0.5435174703598022, + "sampling/sampling_logp_difference/max": 0.6833771467208862, + "sampling/sampling_logp_difference/mean": 0.04181923717260361, + "step": 9, + "step_time": 13.294292020000057 + }, + { + "clip_ratio/high_max": 0.03906210558488965, + "clip_ratio/high_mean": 0.015391179244033992, + "clip_ratio/low_mean": 0.0073633925057947636, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.022754571866244078, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2793.0, + "completions/max_terminated_length": 2793.0, + "completions/mean_length": 2214.53125, + "completions/mean_terminated_length": 2214.53125, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.41637370735406876, + "epoch": 0.0008, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.541189670562744, + "kl": 0.004748326842673123, + "learning_rate": 1.5428571428571428e-06, + "loss": -0.0168, + "num_tokens": 783707.0, + "reward": 0.4284375309944153, + "reward_std": 0.1260128915309906, + "rewards/rollout_reward_func/mean": 0.4284375309944153, + "rewards/rollout_reward_func/std": 0.3622608780860901, + "sampling/importance_sampling_ratio/max": 2.2261769771575928, + "sampling/importance_sampling_ratio/mean": 1.042180061340332, + "sampling/importance_sampling_ratio/min": 0.2320551723241806, + "sampling/sampling_logp_difference/max": 1.021528959274292, + "sampling/sampling_logp_difference/mean": 0.04730905592441559, + "step": 10, + "step_time": 13.637110585000073 + }, + { + "clip_ratio/high_max": 0.010990338400006294, + "clip_ratio/high_mean": 0.0027475846000015736, + "clip_ratio/low_mean": 0.0016025641234591603, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004350148723460734, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2793.0, + "completions/max_terminated_length": 2793.0, + "completions/mean_length": 1966.4375, + "completions/mean_terminated_length": 1966.4375, + "completions/min_length": 1055.0, + "completions/min_terminated_length": 1055.0, + "entropy": 0.4171219617128372, + "epoch": 0.00088, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.4207873344421387, + "kl": 0.0035471616429276764, + "learning_rate": 1.7142857142857143e-06, + "loss": 0.0422, + "num_tokens": 858994.0, + "reward": 0.4596875309944153, + "reward_std": 0.14279377460479736, + "rewards/rollout_reward_func/mean": 0.4596875309944153, + "rewards/rollout_reward_func/std": 0.3725802004337311, + "sampling/importance_sampling_ratio/max": 1.874053716659546, + "sampling/importance_sampling_ratio/mean": 0.9027889966964722, + "sampling/importance_sampling_ratio/min": 0.45684853196144104, + "sampling/sampling_logp_difference/max": 0.5253086090087891, + "sampling/sampling_logp_difference/mean": 0.0447448305785656, + "step": 11, + "step_time": 12.90678849699998 + }, + { + "clip_ratio/high_max": 0.039288708940148354, + "clip_ratio/high_mean": 0.017087680520489812, + "clip_ratio/low_mean": 0.008439590455964208, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02552727097645402, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2782.0, + "completions/max_terminated_length": 2782.0, + "completions/mean_length": 1859.5625, + "completions/mean_terminated_length": 1859.5625, + "completions/min_length": 1053.0, + "completions/min_terminated_length": 1053.0, + "entropy": 0.42384539544582367, + "epoch": 0.00096, + "frac_reward_zero_std": 0.375, + "grad_norm": 2.1793887615203857, + "kl": 0.004719441349152476, + "learning_rate": 1.8857142857142858e-06, + "loss": -0.0501, + "num_tokens": 930647.0, + "reward": 0.5653125047683716, + "reward_std": 0.09132834523916245, + "rewards/rollout_reward_func/mean": 0.5653125047683716, + "rewards/rollout_reward_func/std": 0.4122869372367859, + "sampling/importance_sampling_ratio/max": 1.968488097190857, + "sampling/importance_sampling_ratio/mean": 1.1238960027694702, + "sampling/importance_sampling_ratio/min": 0.5891481637954712, + "sampling/sampling_logp_difference/max": 0.9189000129699707, + "sampling/sampling_logp_difference/mean": 0.045362215489149094, + "step": 12, + "step_time": 12.105040577999944 + }, + { + "clip_ratio/high_max": 0.03163956617936492, + "clip_ratio/high_mean": 0.009185401839204133, + "clip_ratio/low_mean": 0.01267810445278883, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.021863506408408284, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2794.0, + "completions/max_terminated_length": 2794.0, + "completions/mean_length": 1744.0625, + "completions/mean_terminated_length": 1744.0625, + "completions/min_length": 1053.0, + "completions/min_terminated_length": 1053.0, + "entropy": 0.38077671080827713, + "epoch": 0.00104, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.601933240890503, + "kl": 0.0043588640401139855, + "learning_rate": 2.0571428571428573e-06, + "loss": -0.0271, + "num_tokens": 998268.0, + "reward": 0.5040624737739563, + "reward_std": 0.3648141622543335, + "rewards/rollout_reward_func/mean": 0.5040624737739563, + "rewards/rollout_reward_func/std": 0.4420104920864105, + "sampling/importance_sampling_ratio/max": 2.2825241088867188, + "sampling/importance_sampling_ratio/mean": 1.0028969049453735, + "sampling/importance_sampling_ratio/min": 0.37051475048065186, + "sampling/sampling_logp_difference/max": 0.6929263472557068, + "sampling/sampling_logp_difference/mean": 0.043037254363298416, + "step": 13, + "step_time": 12.434848872999964 + }, + { + "clip_ratio/high_max": 0.0682385629042983, + "clip_ratio/high_mean": 0.022985405288636684, + "clip_ratio/low_mean": 0.0055555556900799274, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.028540961910039186, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2797.0, + "completions/max_terminated_length": 2797.0, + "completions/mean_length": 2003.40625, + "completions/mean_terminated_length": 2003.40625, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.40549617260694504, + "epoch": 0.00112, + "frac_reward_zero_std": 0.125, + "grad_norm": 2.6730706691741943, + "kl": 0.004465080099180341, + "learning_rate": 2.2285714285714286e-06, + "loss": 0.0367, + "num_tokens": 1075200.0, + "reward": 0.3971875011920929, + "reward_std": 0.24656714498996735, + "rewards/rollout_reward_func/mean": 0.3971875011920929, + "rewards/rollout_reward_func/std": 0.3921506702899933, + "sampling/importance_sampling_ratio/max": 2.08994197845459, + "sampling/importance_sampling_ratio/mean": 0.9472236037254333, + "sampling/importance_sampling_ratio/min": 0.2920815646648407, + "sampling/sampling_logp_difference/max": 0.5747750997543335, + "sampling/sampling_logp_difference/mean": 0.0427585169672966, + "step": 14, + "step_time": 13.12732943400033 + }, + { + "clip_ratio/high_max": 0.04484127042815089, + "clip_ratio/high_mean": 0.01285505446139723, + "clip_ratio/low_mean": 0.008134920848533511, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.020989975426346064, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2801.0, + "completions/max_terminated_length": 2801.0, + "completions/mean_length": 1879.96875, + "completions/mean_terminated_length": 1879.96875, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.3704817444086075, + "epoch": 0.0012, + "frac_reward_zero_std": 0.375, + "grad_norm": 1.3741340637207031, + "kl": 0.004930144699756056, + "learning_rate": 2.4000000000000003e-06, + "loss": -0.036, + "num_tokens": 1147654.0, + "reward": 0.5634374618530273, + "reward_std": 0.2032102644443512, + "rewards/rollout_reward_func/mean": 0.5634374618530273, + "rewards/rollout_reward_func/std": 0.4479026794433594, + "sampling/importance_sampling_ratio/max": 1.419919490814209, + "sampling/importance_sampling_ratio/mean": 0.8213506937026978, + "sampling/importance_sampling_ratio/min": 0.2297196239233017, + "sampling/sampling_logp_difference/max": 0.9635820388793945, + "sampling/sampling_logp_difference/mean": 0.04450097680091858, + "step": 15, + "step_time": 12.787183091000088 + }, + { + "clip_ratio/high_max": 0.040403091348707676, + "clip_ratio/high_mean": 0.019732415094040334, + "clip_ratio/low_mean": 0.011093285749666393, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.030825700610876083, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2812.0, + "completions/max_terminated_length": 2812.0, + "completions/mean_length": 2226.75, + "completions/mean_terminated_length": 2226.75, + "completions/min_length": 1054.0, + "completions/min_terminated_length": 1054.0, + "entropy": 0.4390428438782692, + "epoch": 0.00128, + "frac_reward_zero_std": 0.125, + "grad_norm": 3.3430113792419434, + "kl": 0.005434123100712895, + "learning_rate": 2.571428571428571e-06, + "loss": 0.0608, + "num_tokens": 1232479.0, + "reward": 0.35874998569488525, + "reward_std": 0.16885429620742798, + "rewards/rollout_reward_func/mean": 0.35874998569488525, + "rewards/rollout_reward_func/std": 0.31368517875671387, + "sampling/importance_sampling_ratio/max": 2.1880178451538086, + "sampling/importance_sampling_ratio/mean": 0.9618589878082275, + "sampling/importance_sampling_ratio/min": 0.12961336970329285, + "sampling/sampling_logp_difference/max": 0.941362738609314, + "sampling/sampling_logp_difference/mean": 0.05188923329114914, + "step": 16, + "step_time": 13.173405885999728 + }, + { + "clip_ratio/high_max": 0.04390919208526611, + "clip_ratio/high_mean": 0.017439239425584674, + "clip_ratio/low_mean": 0.007801226573064923, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.025240465998649597, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2434.0, + "completions/max_terminated_length": 2434.0, + "completions/mean_length": 1750.0625, + "completions/mean_terminated_length": 1750.0625, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.3980557546019554, + "epoch": 0.00136, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.602077007293701, + "kl": 0.004334585275501013, + "learning_rate": 2.742857142857143e-06, + "loss": -0.0264, + "num_tokens": 1300635.0, + "reward": 0.38499999046325684, + "reward_std": 0.221183180809021, + "rewards/rollout_reward_func/mean": 0.38499999046325684, + "rewards/rollout_reward_func/std": 0.34839722514152527, + "sampling/importance_sampling_ratio/max": 1.7235372066497803, + "sampling/importance_sampling_ratio/mean": 0.9467421770095825, + "sampling/importance_sampling_ratio/min": 0.2654297649860382, + "sampling/sampling_logp_difference/max": 0.7773740887641907, + "sampling/sampling_logp_difference/mean": 0.04712219163775444, + "step": 17, + "step_time": 11.411276259999795 + }, + { + "clip_ratio/high_max": 0.04594441968947649, + "clip_ratio/high_mean": 0.013718247646465898, + "clip_ratio/low_mean": 0.004949534311890602, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.018667781492695212, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2798.0, + "completions/max_terminated_length": 2798.0, + "completions/mean_length": 2309.09375, + "completions/mean_terminated_length": 2309.09375, + "completions/min_length": 1055.0, + "completions/min_terminated_length": 1055.0, + "entropy": 0.4307108670473099, + "epoch": 0.00144, + "frac_reward_zero_std": 0.375, + "grad_norm": 1.9505234956741333, + "kl": 0.004669323505368084, + "learning_rate": 2.9142857142857142e-06, + "loss": 0.0981, + "num_tokens": 1388529.0, + "reward": 0.3696874976158142, + "reward_std": 0.155008003115654, + "rewards/rollout_reward_func/mean": 0.3696874976158142, + "rewards/rollout_reward_func/std": 0.28414538502693176, + "sampling/importance_sampling_ratio/max": 1.8336728811264038, + "sampling/importance_sampling_ratio/mean": 0.9352109432220459, + "sampling/importance_sampling_ratio/min": 0.28059616684913635, + "sampling/sampling_logp_difference/max": 1.0694303512573242, + "sampling/sampling_logp_difference/mean": 0.05270082503557205, + "step": 18, + "step_time": 13.463537552000162 + }, + { + "clip_ratio/high_max": 0.03351574344560504, + "clip_ratio/high_mean": 0.017963151913136244, + "clip_ratio/low_mean": 0.005672972998581827, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.023636124562472105, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2782.0, + "completions/max_terminated_length": 2782.0, + "completions/mean_length": 2203.90625, + "completions/mean_terminated_length": 2203.90625, + "completions/min_length": 1564.0, + "completions/min_terminated_length": 1564.0, + "entropy": 0.4163732975721359, + "epoch": 0.00152, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.54580020904541, + "kl": 0.0039027896127663553, + "learning_rate": 3.0857142857142855e-06, + "loss": -0.0385, + "num_tokens": 1472480.0, + "reward": 0.2887499928474426, + "reward_std": 0.1067335307598114, + "rewards/rollout_reward_func/mean": 0.2887499928474426, + "rewards/rollout_reward_func/std": 0.17496080696582794, + "sampling/importance_sampling_ratio/max": 2.46917724609375, + "sampling/importance_sampling_ratio/mean": 1.0520013570785522, + "sampling/importance_sampling_ratio/min": 0.31319668889045715, + "sampling/sampling_logp_difference/max": 0.6751515865325928, + "sampling/sampling_logp_difference/mean": 0.04795370250940323, + "step": 19, + "step_time": 12.881922476999762 + }, + { + "clip_ratio/high_max": 0.03289473615586758, + "clip_ratio/high_mean": 0.016854635905474424, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.016854635905474424, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2803.0, + "completions/max_terminated_length": 2803.0, + "completions/mean_length": 1839.5625, + "completions/mean_terminated_length": 1839.5625, + "completions/min_length": 1053.0, + "completions/min_terminated_length": 1053.0, + "entropy": 0.3896697387099266, + "epoch": 0.0016, + "frac_reward_zero_std": 0.5, + "grad_norm": 3.0274949073791504, + "kl": 0.004332752665504813, + "learning_rate": 3.257142857142857e-06, + "loss": 0.1087, + "num_tokens": 1543546.0, + "reward": 0.4571874737739563, + "reward_std": 0.20620864629745483, + "rewards/rollout_reward_func/mean": 0.4571874737739563, + "rewards/rollout_reward_func/std": 0.38446637988090515, + "sampling/importance_sampling_ratio/max": 2.2497854232788086, + "sampling/importance_sampling_ratio/mean": 0.9864073395729065, + "sampling/importance_sampling_ratio/min": 0.3370327055454254, + "sampling/sampling_logp_difference/max": 0.9195313453674316, + "sampling/sampling_logp_difference/mean": 0.04598519578576088, + "step": 20, + "step_time": 13.28698261799991 + }, + { + "clip_ratio/high_max": 0.047807968221604824, + "clip_ratio/high_mean": 0.017668311716988683, + "clip_ratio/low_mean": 0.0087070451118052, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.026375357527285814, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2784.0, + "completions/max_terminated_length": 2784.0, + "completions/mean_length": 1916.84375, + "completions/mean_terminated_length": 1916.84375, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.3857065215706825, + "epoch": 0.00168, + "frac_reward_zero_std": 0.125, + "grad_norm": 2.2928388118743896, + "kl": 0.0030007859459146857, + "learning_rate": 3.4285714285714285e-06, + "loss": 0.0691, + "num_tokens": 1617299.0, + "reward": 0.5199999809265137, + "reward_std": 0.24810142815113068, + "rewards/rollout_reward_func/mean": 0.5199999809265137, + "rewards/rollout_reward_func/std": 0.44394853711128235, + "sampling/importance_sampling_ratio/max": 1.9692103862762451, + "sampling/importance_sampling_ratio/mean": 1.0206944942474365, + "sampling/importance_sampling_ratio/min": 0.37676262855529785, + "sampling/sampling_logp_difference/max": 0.5263509750366211, + "sampling/sampling_logp_difference/mean": 0.04122690111398697, + "step": 21, + "step_time": 12.188486421000334 + }, + { + "clip_ratio/high_max": 0.040458154398947954, + "clip_ratio/high_mean": 0.011364538804627955, + "clip_ratio/low_mean": 0.006526540499180555, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.017891079653054476, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2801.0, + "completions/max_terminated_length": 2801.0, + "completions/mean_length": 1878.4375, + "completions/mean_terminated_length": 1878.4375, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.3931129276752472, + "epoch": 0.00176, + "frac_reward_zero_std": 0.125, + "grad_norm": 1.7913806438446045, + "kl": 0.004387594643048942, + "learning_rate": 3.6e-06, + "loss": -0.0657, + "num_tokens": 1690165.0, + "reward": 0.48281246423721313, + "reward_std": 0.24489575624465942, + "rewards/rollout_reward_func/mean": 0.48281246423721313, + "rewards/rollout_reward_func/std": 0.42833685874938965, + "sampling/importance_sampling_ratio/max": 2.001044750213623, + "sampling/importance_sampling_ratio/mean": 0.8716533780097961, + "sampling/importance_sampling_ratio/min": 0.21946659684181213, + "sampling/sampling_logp_difference/max": 0.6549723148345947, + "sampling/sampling_logp_difference/mean": 0.04290828853845596, + "step": 22, + "step_time": 12.900562208999872 + }, + { + "clip_ratio/high_max": 0.012908496893942356, + "clip_ratio/high_mean": 0.003227124223485589, + "clip_ratio/low_mean": 0.0013888889225199819, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004616013146005571, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2795.0, + "completions/max_terminated_length": 2795.0, + "completions/mean_length": 1921.96875, + "completions/mean_terminated_length": 1921.96875, + "completions/min_length": 1053.0, + "completions/min_terminated_length": 1053.0, + "entropy": 0.3991682603955269, + "epoch": 0.00184, + "frac_reward_zero_std": 0.375, + "grad_norm": 2.757072687149048, + "kl": 0.002871026110369712, + "learning_rate": 3.7714285714285716e-06, + "loss": -0.0322, + "num_tokens": 1764351.0, + "reward": 0.4725000262260437, + "reward_std": 0.15098075568675995, + "rewards/rollout_reward_func/mean": 0.4725000262260437, + "rewards/rollout_reward_func/std": 0.3937331438064575, + "sampling/importance_sampling_ratio/max": 2.473691463470459, + "sampling/importance_sampling_ratio/mean": 1.0277502536773682, + "sampling/importance_sampling_ratio/min": 0.3683130145072937, + "sampling/sampling_logp_difference/max": 0.8325839042663574, + "sampling/sampling_logp_difference/mean": 0.041013769805431366, + "step": 23, + "step_time": 12.923486550999996 + }, + { + "clip_ratio/high_max": 0.022044573910534382, + "clip_ratio/high_mean": 0.0072472544852644205, + "clip_ratio/low_mean": 0.007787698996253312, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015034952783025801, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2791.0, + "completions/max_terminated_length": 2791.0, + "completions/mean_length": 2014.1875, + "completions/mean_terminated_length": 2014.1875, + "completions/min_length": 1054.0, + "completions/min_terminated_length": 1054.0, + "entropy": 0.39251676946878433, + "epoch": 0.00192, + "frac_reward_zero_std": 0.375, + "grad_norm": 1.9654884338378906, + "kl": 0.0081728242803365, + "learning_rate": 3.942857142857143e-06, + "loss": -0.0383, + "num_tokens": 1841628.0, + "reward": 0.35874998569488525, + "reward_std": 0.21719886362552643, + "rewards/rollout_reward_func/mean": 0.35874998569488525, + "rewards/rollout_reward_func/std": 0.31252095103263855, + "sampling/importance_sampling_ratio/max": 2.0834484100341797, + "sampling/importance_sampling_ratio/mean": 0.9893499612808228, + "sampling/importance_sampling_ratio/min": 0.06596492230892181, + "sampling/sampling_logp_difference/max": 1.764291524887085, + "sampling/sampling_logp_difference/mean": 0.05037356913089752, + "step": 24, + "step_time": 12.55188547500029 + }, + { + "clip_ratio/high_max": 0.03194444486871362, + "clip_ratio/high_mean": 0.009474206599406898, + "clip_ratio/low_mean": 0.004620927385985851, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.014095134101808071, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2432.0, + "completions/max_terminated_length": 2432.0, + "completions/mean_length": 1997.46875, + "completions/mean_terminated_length": 1997.46875, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.40253835916519165, + "epoch": 0.002, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.1649582386016846, + "kl": 0.007934511464554816, + "learning_rate": 4.114285714285715e-06, + "loss": -0.084, + "num_tokens": 1918276.0, + "reward": 0.3425000011920929, + "reward_std": 0.16030071675777435, + "rewards/rollout_reward_func/mean": 0.3425000011920929, + "rewards/rollout_reward_func/std": 0.27845191955566406, + "sampling/importance_sampling_ratio/max": 1.7379083633422852, + "sampling/importance_sampling_ratio/mean": 1.0123233795166016, + "sampling/importance_sampling_ratio/min": 0.21978217363357544, + "sampling/sampling_logp_difference/max": 0.9820888042449951, + "sampling/sampling_logp_difference/mean": 0.043975915759801865, + "step": 25, + "step_time": 11.6416912709999 + }, + { + "clip_ratio/high_max": 0.057189542800188065, + "clip_ratio/high_mean": 0.02329625654965639, + "clip_ratio/low_mean": 0.008795286994427443, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.03209154261276126, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2812.0, + "completions/max_terminated_length": 2812.0, + "completions/mean_length": 2010.0625, + "completions/mean_terminated_length": 2010.0625, + "completions/min_length": 1053.0, + "completions/min_terminated_length": 1053.0, + "entropy": 0.3653796315193176, + "epoch": 0.00208, + "frac_reward_zero_std": 0.125, + "grad_norm": 2.1230344772338867, + "kl": 0.006589570315554738, + "learning_rate": 4.285714285714286e-06, + "loss": -0.0197, + "num_tokens": 1995372.0, + "reward": 0.4256249964237213, + "reward_std": 0.23703671991825104, + "rewards/rollout_reward_func/mean": 0.4256249964237213, + "rewards/rollout_reward_func/std": 0.3602412939071655, + "sampling/importance_sampling_ratio/max": 1.7632914781570435, + "sampling/importance_sampling_ratio/mean": 0.9213794469833374, + "sampling/importance_sampling_ratio/min": 0.4378761649131775, + "sampling/sampling_logp_difference/max": 0.56688392162323, + "sampling/sampling_logp_difference/mean": 0.03944293037056923, + "step": 26, + "step_time": 13.15109654299954 + }, + { + "clip_ratio/high_max": 0.04949874710291624, + "clip_ratio/high_mean": 0.02149919094517827, + "clip_ratio/low_mean": 0.00766741088591516, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02916660183109343, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2785.0, + "completions/max_terminated_length": 2785.0, + "completions/mean_length": 1842.0625, + "completions/mean_terminated_length": 1842.0625, + "completions/min_length": 1053.0, + "completions/min_terminated_length": 1053.0, + "entropy": 0.4269302785396576, + "epoch": 0.00216, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.6013935804367065, + "kl": 0.00617267657071352, + "learning_rate": 4.457142857142857e-06, + "loss": -0.0345, + "num_tokens": 2066465.0, + "reward": 0.5221875309944153, + "reward_std": 0.22779378294944763, + "rewards/rollout_reward_func/mean": 0.5221875309944153, + "rewards/rollout_reward_func/std": 0.4334239661693573, + "sampling/importance_sampling_ratio/max": 2.312187433242798, + "sampling/importance_sampling_ratio/mean": 0.8621585369110107, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.9948511123657227, + "sampling/sampling_logp_difference/mean": 0.051924653351306915, + "step": 27, + "step_time": 12.681567872999949 + }, + { + "clip_ratio/high_max": 0.04371212236583233, + "clip_ratio/high_mean": 0.0183574166148901, + "clip_ratio/low_mean": 0.005908275721594691, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.024265691870823503, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2796.0, + "completions/max_terminated_length": 2796.0, + "completions/mean_length": 2155.5, + "completions/mean_terminated_length": 2155.5, + "completions/min_length": 1053.0, + "completions/min_terminated_length": 1053.0, + "entropy": 0.41429970413446426, + "epoch": 0.00224, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.647275447845459, + "kl": 0.010079714236781001, + "learning_rate": 4.628571428571429e-06, + "loss": -0.0864, + "num_tokens": 2148817.0, + "reward": 0.3021875023841858, + "reward_std": 0.11279378086328506, + "rewards/rollout_reward_func/mean": 0.3021875023841858, + "rewards/rollout_reward_func/std": 0.23064753413200378, + "sampling/importance_sampling_ratio/max": 2.1843345165252686, + "sampling/importance_sampling_ratio/mean": 0.9328470230102539, + "sampling/importance_sampling_ratio/min": 0.11585874110460281, + "sampling/sampling_logp_difference/max": 1.9821176528930664, + "sampling/sampling_logp_difference/mean": 0.05276907980442047, + "step": 28, + "step_time": 12.799536062000016 + }, + { + "clip_ratio/high_max": 0.039141415152698755, + "clip_ratio/high_mean": 0.019034530967473984, + "clip_ratio/low_mean": 0.005208333372138441, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02424286410678178, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2411.0, + "completions/max_terminated_length": 2411.0, + "completions/mean_length": 1544.21875, + "completions/mean_terminated_length": 1544.21875, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.38873114436864853, + "epoch": 0.00232, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.288419485092163, + "kl": 0.008441059850156307, + "learning_rate": 4.800000000000001e-06, + "loss": -0.0294, + "num_tokens": 2209518.0, + "reward": 0.5049999952316284, + "reward_std": 0.367961049079895, + "rewards/rollout_reward_func/mean": 0.5049999952316284, + "rewards/rollout_reward_func/std": 0.4586867392063141, + "sampling/importance_sampling_ratio/max": 1.7176055908203125, + "sampling/importance_sampling_ratio/mean": 0.8919655084609985, + "sampling/importance_sampling_ratio/min": 0.3174732029438019, + "sampling/sampling_logp_difference/max": 1.007685899734497, + "sampling/sampling_logp_difference/mean": 0.043198756873607635, + "step": 29, + "step_time": 11.569315259000177 + }, + { + "clip_ratio/high_max": 0.03119284799322486, + "clip_ratio/high_mean": 0.009251700364984572, + "clip_ratio/low_mean": 0.0032051282469183207, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.012456828728318214, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2776.0, + "completions/max_terminated_length": 2776.0, + "completions/mean_length": 1695.40625, + "completions/mean_terminated_length": 1695.40625, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.38929111510515213, + "epoch": 0.0024, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.749756932258606, + "kl": 0.01017191493883729, + "learning_rate": 4.9714285714285715e-06, + "loss": 0.0146, + "num_tokens": 2275561.0, + "reward": 0.5309374928474426, + "reward_std": 0.32216140627861023, + "rewards/rollout_reward_func/mean": 0.5309374928474426, + "rewards/rollout_reward_func/std": 0.4390852451324463, + "sampling/importance_sampling_ratio/max": 2.9540531635284424, + "sampling/importance_sampling_ratio/mean": 1.0208276510238647, + "sampling/importance_sampling_ratio/min": 0.37041175365448, + "sampling/sampling_logp_difference/max": 0.5885751247406006, + "sampling/sampling_logp_difference/mean": 0.04683335870504379, + "step": 30, + "step_time": 12.191692169999897 + }, + { + "clip_ratio/high_max": 0.05563905602321029, + "clip_ratio/high_mean": 0.01747169380541891, + "clip_ratio/low_mean": 0.008184524020180106, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02565621805842966, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2783.0, + "completions/max_terminated_length": 2783.0, + "completions/mean_length": 1801.09375, + "completions/mean_terminated_length": 1801.09375, + "completions/min_length": 1053.0, + "completions/min_terminated_length": 1053.0, + "entropy": 0.3914438411593437, + "epoch": 0.00248, + "frac_reward_zero_std": 0.125, + "grad_norm": 2.8585875034332275, + "kl": 0.015274998731911182, + "learning_rate": 5.142857142857142e-06, + "loss": 0.0419, + "num_tokens": 2345322.0, + "reward": 0.36281251907348633, + "reward_std": 0.2801453769207001, + "rewards/rollout_reward_func/mean": 0.36281251907348633, + "rewards/rollout_reward_func/std": 0.342911958694458, + "sampling/importance_sampling_ratio/max": 2.163181781768799, + "sampling/importance_sampling_ratio/mean": 0.9487945437431335, + "sampling/importance_sampling_ratio/min": 0.29707521200180054, + "sampling/sampling_logp_difference/max": 0.7824678421020508, + "sampling/sampling_logp_difference/mean": 0.0532098188996315, + "step": 31, + "step_time": 13.19187305000014 + }, + { + "clip_ratio/high_max": 0.03187447274103761, + "clip_ratio/high_mean": 0.018647319404408336, + "clip_ratio/low_mean": 0.004727297928184271, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02337461756542325, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2435.0, + "completions/max_terminated_length": 2435.0, + "completions/mean_length": 1984.90625, + "completions/mean_terminated_length": 1984.90625, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.416415698826313, + "epoch": 0.00256, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.3030495643615723, + "kl": 0.015865659108385444, + "learning_rate": 5.314285714285714e-06, + "loss": -0.0567, + "num_tokens": 2421421.0, + "reward": 0.3878124952316284, + "reward_std": 0.23157384991645813, + "rewards/rollout_reward_func/mean": 0.3878124952316284, + "rewards/rollout_reward_func/std": 0.3412286341190338, + "sampling/importance_sampling_ratio/max": 2.5926010608673096, + "sampling/importance_sampling_ratio/mean": 0.9760158658027649, + "sampling/importance_sampling_ratio/min": 0.2061164528131485, + "sampling/sampling_logp_difference/max": 0.8063008785247803, + "sampling/sampling_logp_difference/mean": 0.04909588024020195, + "step": 32, + "step_time": 11.466194520999807 + }, + { + "clip_ratio/high_max": 0.019717262126505375, + "clip_ratio/high_mean": 0.004929315531626344, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004929315531626344, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2809.0, + "completions/max_terminated_length": 2809.0, + "completions/mean_length": 2102.71875, + "completions/mean_terminated_length": 2102.71875, + "completions/min_length": 1054.0, + "completions/min_terminated_length": 1054.0, + "entropy": 0.42558059841394424, + "epoch": 0.00264, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.5914040803909302, + "kl": 0.010543531039729714, + "learning_rate": 5.485714285714286e-06, + "loss": 0.0448, + "num_tokens": 2501867.0, + "reward": 0.5221875309944153, + "reward_std": 0.14279377460479736, + "rewards/rollout_reward_func/mean": 0.5221875309944153, + "rewards/rollout_reward_func/std": 0.4007873833179474, + "sampling/importance_sampling_ratio/max": 1.5994207859039307, + "sampling/importance_sampling_ratio/mean": 0.8397550582885742, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.9267706871032715, + "sampling/sampling_logp_difference/mean": 0.0471554696559906, + "step": 33, + "step_time": 12.975996798000097 + }, + { + "clip_ratio/high_max": 0.040178571827709675, + "clip_ratio/high_mean": 0.016144166933372617, + "clip_ratio/low_mean": 0.005662594106979668, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.021806761040352285, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2441.0, + "completions/max_terminated_length": 2441.0, + "completions/mean_length": 1488.4375, + "completions/mean_terminated_length": 1488.4375, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.35695891827344894, + "epoch": 0.00272, + "frac_reward_zero_std": 0.125, + "grad_norm": 1.6733559370040894, + "kl": 0.020034206565469503, + "learning_rate": 5.6571428571428576e-06, + "loss": -0.0588, + "num_tokens": 2560884.0, + "reward": 0.5859375, + "reward_std": 0.38607701659202576, + "rewards/rollout_reward_func/mean": 0.5859375, + "rewards/rollout_reward_func/std": 0.45654281973838806, + "sampling/importance_sampling_ratio/max": 1.8220971822738647, + "sampling/importance_sampling_ratio/mean": 0.9860107898712158, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.9601047039031982, + "sampling/sampling_logp_difference/mean": 0.052328821271657944, + "step": 34, + "step_time": 10.76481853400037 + }, + { + "clip_ratio/high_max": 0.00657894741743803, + "clip_ratio/high_mean": 0.003289473708719015, + "clip_ratio/low_mean": 0.008878070977516472, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.012167544686235487, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2801.0, + "completions/max_terminated_length": 2801.0, + "completions/mean_length": 1756.46875, + "completions/mean_terminated_length": 1756.46875, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.38564804941415787, + "epoch": 0.0028, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5950710773468018, + "kl": 0.0196278584189713, + "learning_rate": 5.8285714285714284e-06, + "loss": 0.0794, + "num_tokens": 2629098.0, + "reward": 0.4750000238418579, + "reward_std": 0.26933756470680237, + "rewards/rollout_reward_func/mean": 0.4750000238418579, + "rewards/rollout_reward_func/std": 0.40420371294021606, + "sampling/importance_sampling_ratio/max": 2.8944315910339355, + "sampling/importance_sampling_ratio/mean": 1.212613582611084, + "sampling/importance_sampling_ratio/min": 0.3920697867870331, + "sampling/sampling_logp_difference/max": 0.7614344358444214, + "sampling/sampling_logp_difference/mean": 0.050811417400836945, + "step": 35, + "step_time": 12.117880444999855 + }, + { + "clip_ratio/high_max": 0.032855731435120106, + "clip_ratio/high_mean": 0.008213932858780026, + "clip_ratio/low_mean": 0.008068988332524896, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.016282920725643635, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2819.0, + "completions/max_terminated_length": 2819.0, + "completions/mean_length": 2214.375, + "completions/mean_terminated_length": 2214.375, + "completions/min_length": 1579.0, + "completions/min_terminated_length": 1579.0, + "entropy": 0.4132639244198799, + "epoch": 0.00288, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4248710870742798, + "kl": 0.04949819762259722, + "learning_rate": 6e-06, + "loss": -0.1152, + "num_tokens": 2713433.0, + "reward": 0.3043749928474426, + "reward_std": 0.08011817932128906, + "rewards/rollout_reward_func/mean": 0.3043749928474426, + "rewards/rollout_reward_func/std": 0.16871310770511627, + "sampling/importance_sampling_ratio/max": 2.279515504837036, + "sampling/importance_sampling_ratio/mean": 1.0208816528320312, + "sampling/importance_sampling_ratio/min": 0.2197788804769516, + "sampling/sampling_logp_difference/max": 1.5309280157089233, + "sampling/sampling_logp_difference/mean": 0.05491582304239273, + "step": 36, + "step_time": 13.165009270000155 + }, + { + "clip_ratio/high_max": 0.01785714365541935, + "clip_ratio/high_mean": 0.004464285913854837, + "clip_ratio/low_mean": 0.0022321429569274187, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006696428870782256, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2803.0, + "completions/max_terminated_length": 2803.0, + "completions/mean_length": 1736.1875, + "completions/mean_terminated_length": 1736.1875, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.3515569269657135, + "epoch": 0.00296, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0670298337936401, + "kl": 0.025617226026952267, + "learning_rate": 5.999999982184864e-06, + "loss": 0.0221, + "num_tokens": 2780777.0, + "reward": 0.4387500286102295, + "reward_std": 0.25966876745224, + "rewards/rollout_reward_func/mean": 0.4387500286102295, + "rewards/rollout_reward_func/std": 0.3832606077194214, + "sampling/importance_sampling_ratio/max": 2.3271644115448, + "sampling/importance_sampling_ratio/mean": 1.0649113655090332, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.0678925514221191, + "sampling/sampling_logp_difference/mean": 0.05666026473045349, + "step": 37, + "step_time": 12.593250806000015 + }, + { + "clip_ratio/high_max": 0.028383397962898016, + "clip_ratio/high_mean": 0.010161041049286723, + "clip_ratio/low_mean": 0.006483843666501343, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.016644884599372745, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2777.0, + "completions/max_terminated_length": 2777.0, + "completions/mean_length": 1819.5625, + "completions/mean_terminated_length": 1819.5625, + "completions/min_length": 1056.0, + "completions/min_terminated_length": 1056.0, + "entropy": 0.38034912198781967, + "epoch": 0.00304, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.0448880195617676, + "kl": 0.04296189732849598, + "learning_rate": 5.999999928739459e-06, + "loss": -0.0115, + "num_tokens": 2851032.0, + "reward": 0.6024999618530273, + "reward_std": 0.2617889940738678, + "rewards/rollout_reward_func/mean": 0.6024999618530273, + "rewards/rollout_reward_func/std": 0.44098126888275146, + "sampling/importance_sampling_ratio/max": 2.681164503097534, + "sampling/importance_sampling_ratio/mean": 1.0418896675109863, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.4294462203979492, + "sampling/sampling_logp_difference/mean": 0.0609976202249527, + "step": 38, + "step_time": 12.55964067500031 + }, + { + "clip_ratio/high_max": 0.047167123295366764, + "clip_ratio/high_mean": 0.014736625598743558, + "clip_ratio/low_mean": 0.004429678898304701, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01916630449704826, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2820.0, + "completions/max_terminated_length": 2820.0, + "completions/mean_length": 2000.0, + "completions/mean_terminated_length": 2000.0, + "completions/min_length": 1055.0, + "completions/min_terminated_length": 1055.0, + "entropy": 0.4035666435956955, + "epoch": 0.00312, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.904247760772705, + "kl": 0.03608058113604784, + "learning_rate": 5.999999839663784e-06, + "loss": -0.1975, + "num_tokens": 2927712.0, + "reward": 0.3853124976158142, + "reward_std": 0.1657649129629135, + "rewards/rollout_reward_func/mean": 0.3853124976158142, + "rewards/rollout_reward_func/std": 0.31012988090515137, + "sampling/importance_sampling_ratio/max": 2.3516104221343994, + "sampling/importance_sampling_ratio/mean": 0.8599222898483276, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.4187037944793701, + "sampling/sampling_logp_difference/mean": 0.05978023633360863, + "step": 39, + "step_time": 12.440508590000036 + }, + { + "clip_ratio/high_max": 0.04069459065794945, + "clip_ratio/high_mean": 0.017941734986379743, + "clip_ratio/low_mean": 0.0016447368543595076, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01958647184073925, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2423.0, + "completions/max_terminated_length": 2423.0, + "completions/mean_length": 1889.0625, + "completions/mean_terminated_length": 1889.0625, + "completions/min_length": 1054.0, + "completions/min_terminated_length": 1054.0, + "entropy": 0.42887038737535477, + "epoch": 0.0032, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.507852077484131, + "kl": 0.031137569807469845, + "learning_rate": 5.99999971495784e-06, + "loss": -0.0375, + "num_tokens": 3000212.0, + "reward": 0.38593751192092896, + "reward_std": 0.16842570900917053, + "rewards/rollout_reward_func/mean": 0.38593751192092896, + "rewards/rollout_reward_func/std": 0.35313212871551514, + "sampling/importance_sampling_ratio/max": 1.8619109392166138, + "sampling/importance_sampling_ratio/mean": 0.8876512050628662, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.8854889869689941, + "sampling/sampling_logp_difference/mean": 0.0671561062335968, + "step": 40, + "step_time": 11.693177195999851 + }, + { + "clip_ratio/high_max": 0.02651259582489729, + "clip_ratio/high_mean": 0.006628148956224322, + "clip_ratio/low_mean": 0.0017361111240461469, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00836426008027047, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2789.0, + "completions/max_terminated_length": 2789.0, + "completions/mean_length": 2136.03125, + "completions/mean_terminated_length": 2136.03125, + "completions/min_length": 1053.0, + "completions/min_terminated_length": 1053.0, + "entropy": 0.42095063626766205, + "epoch": 0.00328, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.2850134372711182, + "kl": 0.039208856876939535, + "learning_rate": 5.99999955462163e-06, + "loss": -0.0237, + "num_tokens": 3081651.0, + "reward": 0.3506249785423279, + "reward_std": 0.1440507173538208, + "rewards/rollout_reward_func/mean": 0.3506249785423279, + "rewards/rollout_reward_func/std": 0.2683153748512268, + "sampling/importance_sampling_ratio/max": 2.8166987895965576, + "sampling/importance_sampling_ratio/mean": 1.0108704566955566, + "sampling/importance_sampling_ratio/min": 0.14420194923877716, + "sampling/sampling_logp_difference/max": 1.127936840057373, + "sampling/sampling_logp_difference/mean": 0.06519916653633118, + "step": 41, + "step_time": 14.135176596000292 + }, + { + "clip_ratio/high_max": 0.03996024373918772, + "clip_ratio/high_mean": 0.012911256635561585, + "clip_ratio/low_mean": 0.004817708395421505, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01772896503098309, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2767.0, + "completions/max_terminated_length": 2767.0, + "completions/mean_length": 1934.65625, + "completions/mean_terminated_length": 1934.65625, + "completions/min_length": 1055.0, + "completions/min_terminated_length": 1055.0, + "entropy": 0.38335342705249786, + "epoch": 0.00336, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.1722676753997803, + "kl": 0.13585597835481167, + "learning_rate": 5.999999358655157e-06, + "loss": -0.2418, + "num_tokens": 3156023.0, + "reward": 0.3475000262260437, + "reward_std": 0.21655070781707764, + "rewards/rollout_reward_func/mean": 0.3475000262260437, + "rewards/rollout_reward_func/std": 0.3131937086582184, + "sampling/importance_sampling_ratio/max": 2.6130497455596924, + "sampling/importance_sampling_ratio/mean": 0.8806287050247192, + "sampling/importance_sampling_ratio/min": 0.16678351163864136, + "sampling/sampling_logp_difference/max": 2.3499860763549805, + "sampling/sampling_logp_difference/mean": 0.06342820823192596, + "step": 42, + "step_time": 13.112628190999885 + }, + { + "clip_ratio/high_max": 0.021321472711861134, + "clip_ratio/high_mean": 0.007562511134892702, + "clip_ratio/low_mean": 0.0038768798112869263, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.011439391179010272, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2797.0, + "completions/max_terminated_length": 2797.0, + "completions/mean_length": 1674.3125, + "completions/mean_terminated_length": 1674.3125, + "completions/min_length": 1054.0, + "completions/min_terminated_length": 1054.0, + "entropy": 0.3885280713438988, + "epoch": 0.00344, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4752204418182373, + "kl": 0.036413189955055714, + "learning_rate": 5.999999127058423e-06, + "loss": 0.0258, + "num_tokens": 3221611.0, + "reward": 0.6737500429153442, + "reward_std": 0.25966876745224, + "rewards/rollout_reward_func/mean": 0.6737500429153442, + "rewards/rollout_reward_func/std": 0.4556862711906433, + "sampling/importance_sampling_ratio/max": 2.9477226734161377, + "sampling/importance_sampling_ratio/mean": 1.1396255493164062, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.167872667312622, + "sampling/sampling_logp_difference/mean": 0.06658157706260681, + "step": 43, + "step_time": 12.020586962000152 + }, + { + "clip_ratio/high_max": 0.036011905409395695, + "clip_ratio/high_mean": 0.010423430823720992, + "clip_ratio/low_mean": 0.0030159883899614215, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.013439419795759022, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2814.0, + "completions/max_terminated_length": 2814.0, + "completions/mean_length": 2095.75, + "completions/mean_terminated_length": 2095.75, + "completions/min_length": 1568.0, + "completions/min_terminated_length": 1568.0, + "entropy": 0.39560940861701965, + "epoch": 0.00352, + "frac_reward_zero_std": 0.125, + "grad_norm": 1.8694807291030884, + "kl": 0.1402588039636612, + "learning_rate": 5.999998859831431e-06, + "loss": -0.1597, + "num_tokens": 3301324.0, + "reward": 0.40437501668930054, + "reward_std": 0.2259407639503479, + "rewards/rollout_reward_func/mean": 0.40437501668930054, + "rewards/rollout_reward_func/std": 0.35422733426094055, + "sampling/importance_sampling_ratio/max": 2.6974401473999023, + "sampling/importance_sampling_ratio/mean": 0.8676252365112305, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 2.911269187927246, + "sampling/sampling_logp_difference/mean": 0.08191373944282532, + "step": 44, + "step_time": 12.868387047999704 + }, + { + "clip_ratio/high_max": 0.0369886364787817, + "clip_ratio/high_mean": 0.011032873298972845, + "clip_ratio/low_mean": 0.0043535883305594325, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015386461513116956, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2793.0, + "completions/max_terminated_length": 2793.0, + "completions/mean_length": 2412.75, + "completions/mean_terminated_length": 2412.75, + "completions/min_length": 1056.0, + "completions/min_terminated_length": 1056.0, + "entropy": 0.4342958629131317, + "epoch": 0.0036, + "frac_reward_zero_std": 0.375, + "grad_norm": 1.422937035560608, + "kl": 0.11194289568811655, + "learning_rate": 5.999998556974188e-06, + "loss": -0.1586, + "num_tokens": 3392626.0, + "reward": 0.35750001668930054, + "reward_std": 0.0949999988079071, + "rewards/rollout_reward_func/mean": 0.35750001668930054, + "rewards/rollout_reward_func/std": 0.260532945394516, + "sampling/importance_sampling_ratio/max": 2.1776068210601807, + "sampling/importance_sampling_ratio/mean": 0.852668285369873, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 3.2368037700653076, + "sampling/sampling_logp_difference/mean": 0.07167594134807587, + "step": 45, + "step_time": 13.40532306199998 + }, + { + "clip_ratio/high_max": 0.036038962192833424, + "clip_ratio/high_mean": 0.012058520689606667, + "clip_ratio/low_mean": 0.0017857142956927419, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.013844234868884087, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2784.0, + "completions/max_terminated_length": 2784.0, + "completions/mean_length": 2018.40625, + "completions/mean_terminated_length": 2018.40625, + "completions/min_length": 1053.0, + "completions/min_terminated_length": 1053.0, + "entropy": 0.3769753500819206, + "epoch": 0.00368, + "frac_reward_zero_std": 0.5, + "grad_norm": 6.942874908447266, + "kl": 0.8322499115020037, + "learning_rate": 5.999998218486697e-06, + "loss": -0.0692, + "num_tokens": 3469989.0, + "reward": 0.39250001311302185, + "reward_std": 0.14825798571109772, + "rewards/rollout_reward_func/mean": 0.39250001311302185, + "rewards/rollout_reward_func/std": 0.29918164014816284, + "sampling/importance_sampling_ratio/max": 2.446554660797119, + "sampling/importance_sampling_ratio/mean": 0.8061342239379883, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 3.015519142150879, + "sampling/sampling_logp_difference/mean": 0.07696790993213654, + "step": 46, + "step_time": 12.274703390000013 + }, + { + "clip_ratio/high_max": 0.04237867519259453, + "clip_ratio/high_mean": 0.01807057624682784, + "clip_ratio/low_mean": 0.005178963067010045, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.023249539081007242, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2800.0, + "completions/max_terminated_length": 2800.0, + "completions/mean_length": 1802.75, + "completions/mean_terminated_length": 1802.75, + "completions/min_length": 1053.0, + "completions/min_terminated_length": 1053.0, + "entropy": 0.38206612318754196, + "epoch": 0.00376, + "frac_reward_zero_std": 0.375, + "grad_norm": 1.6050693988800049, + "kl": 0.05531273875385523, + "learning_rate": 5.999997844368963e-06, + "loss": -0.0113, + "num_tokens": 3540097.0, + "reward": 0.4990624785423279, + "reward_std": 0.28371256589889526, + "rewards/rollout_reward_func/mean": 0.4990624785423279, + "rewards/rollout_reward_func/std": 0.41065138578414917, + "sampling/importance_sampling_ratio/max": 1.9599696397781372, + "sampling/importance_sampling_ratio/mean": 0.8884379863739014, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.8157303333282471, + "sampling/sampling_logp_difference/mean": 0.06130218505859375, + "step": 47, + "step_time": 12.33328224800016 + }, + { + "clip_ratio/high_max": 0.0206808946095407, + "clip_ratio/high_mean": 0.005170223652385175, + "clip_ratio/low_mean": 0.004861111170612276, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01003133482299745, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2813.0, + "completions/max_terminated_length": 2813.0, + "completions/mean_length": 1976.21875, + "completions/mean_terminated_length": 1976.21875, + "completions/min_length": 1054.0, + "completions/min_terminated_length": 1054.0, + "entropy": 0.37976498901844025, + "epoch": 0.00384, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5906230211257935, + "kl": 0.11688470654189587, + "learning_rate": 5.999997434620992e-06, + "loss": -0.1357, + "num_tokens": 3616089.0, + "reward": 0.437812477350235, + "reward_std": 0.20705953240394592, + "rewards/rollout_reward_func/mean": 0.437812477350235, + "rewards/rollout_reward_func/std": 0.35220715403556824, + "sampling/importance_sampling_ratio/max": 1.8663876056671143, + "sampling/importance_sampling_ratio/mean": 0.8626433610916138, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 2.65832781791687, + "sampling/sampling_logp_difference/mean": 0.06971758604049683, + "step": 48, + "step_time": 12.541744299999891 + }, + { + "clip_ratio/high_max": 0.012820512987673283, + "clip_ratio/high_mean": 0.0032051282469183207, + "clip_ratio/low_mean": 0.0014534883666783571, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004658616613596678, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2807.0, + "completions/max_terminated_length": 2807.0, + "completions/mean_length": 2245.15625, + "completions/mean_terminated_length": 2245.15625, + "completions/min_length": 1551.0, + "completions/min_terminated_length": 1551.0, + "entropy": 0.4284479096531868, + "epoch": 0.00392, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0771631002426147, + "kl": 0.046674114651978016, + "learning_rate": 5.999996989242791e-06, + "loss": -0.0014, + "num_tokens": 3701038.0, + "reward": 0.42624998092651367, + "reward_std": 0.13466876745224, + "rewards/rollout_reward_func/mean": 0.42624998092651367, + "rewards/rollout_reward_func/std": 0.3314265012741089, + "sampling/importance_sampling_ratio/max": 1.4823979139328003, + "sampling/importance_sampling_ratio/mean": 0.8060042858123779, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.11665940284729, + "sampling/sampling_logp_difference/mean": 0.0697537213563919, + "step": 49, + "step_time": 13.132170692999807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2787.0, + "completions/max_terminated_length": 2787.0, + "completions/mean_length": 2460.71875, + "completions/mean_terminated_length": 2460.71875, + "completions/min_length": 2034.0, + "completions/min_terminated_length": 2034.0, + "entropy": 0.4269709587097168, + "epoch": 0.004, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06742172688245773, + "kl": 0.05730041675269604, + "learning_rate": 5.999996508234369e-06, + "loss": 0.0008, + "num_tokens": 3793655.0, + "reward": 0.30000001192092896, + "reward_std": 0.0, + "rewards/rollout_reward_func/mean": 0.30000001192092896, + "rewards/rollout_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.4013469219207764, + "sampling/importance_sampling_ratio/mean": 0.8128387928009033, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.065826416015625, + "sampling/sampling_logp_difference/mean": 0.07448764890432358, + "step": 50, + "step_time": 13.017520340999681 + }, + { + "clip_ratio/high_max": 0.03630952490493655, + "clip_ratio/high_mean": 0.012549603707157075, + "clip_ratio/low_mean": 0.0031565657118335366, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015706169069744647, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2785.0, + "completions/max_terminated_length": 2785.0, + "completions/mean_length": 1773.78125, + "completions/mean_terminated_length": 1773.78125, + "completions/min_length": 1054.0, + "completions/min_terminated_length": 1054.0, + "entropy": 0.37685880810022354, + "epoch": 0.00408, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.299412727355957, + "kl": 0.04002719838172197, + "learning_rate": 5.999995991595729e-06, + "loss": -0.0109, + "num_tokens": 3862448.0, + "reward": 0.5353125333786011, + "reward_std": 0.08654377609491348, + "rewards/rollout_reward_func/mean": 0.5353125333786011, + "rewards/rollout_reward_func/std": 0.41608762741088867, + "sampling/importance_sampling_ratio/max": 2.3817224502563477, + "sampling/importance_sampling_ratio/mean": 0.9811595678329468, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.0328466892242432, + "sampling/sampling_logp_difference/mean": 0.06913870573043823, + "step": 51, + "step_time": 12.60499654799969 + }, + { + "clip_ratio/high_max": 0.03819444449618459, + "clip_ratio/high_mean": 0.015144050237722695, + "clip_ratio/low_mean": 0.00554396363440901, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.02068801363930106, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2784.0, + "completions/max_terminated_length": 2784.0, + "completions/mean_length": 1786.09375, + "completions/mean_terminated_length": 1786.09375, + "completions/min_length": 1053.0, + "completions/min_terminated_length": 1053.0, + "entropy": 0.38082515448331833, + "epoch": 0.00416, + "frac_reward_zero_std": 0.375, + "grad_norm": 1.5871905088424683, + "kl": 0.06744291074573994, + "learning_rate": 5.999995439326883e-06, + "loss": -0.0699, + "num_tokens": 3931876.0, + "reward": 0.6090624928474426, + "reward_std": 0.26599711179733276, + "rewards/rollout_reward_func/mean": 0.6090624928474426, + "rewards/rollout_reward_func/std": 0.4591953456401825, + "sampling/importance_sampling_ratio/max": 2.734297752380371, + "sampling/importance_sampling_ratio/mean": 0.9665597677230835, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 2.065897226333618, + "sampling/sampling_logp_difference/mean": 0.06354629993438721, + "step": 52, + "step_time": 13.568785395000077 + }, + { + "clip_ratio/high_max": 0.022086466662585735, + "clip_ratio/high_mean": 0.008820227812975645, + "clip_ratio/low_mean": 0.0057043652050197124, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01452459313441068, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2789.0, + "completions/max_terminated_length": 2789.0, + "completions/mean_length": 1635.03125, + "completions/mean_terminated_length": 1635.03125, + "completions/min_length": 1053.0, + "completions/min_terminated_length": 1053.0, + "entropy": 0.36858493834733963, + "epoch": 0.00424, + "frac_reward_zero_std": 0.375, + "grad_norm": 1.9566222429275513, + "kl": 0.07718627620488405, + "learning_rate": 5.999994851427837e-06, + "loss": 0.0822, + "num_tokens": 3995868.0, + "reward": 0.6918749809265137, + "reward_std": 0.3203721046447754, + "rewards/rollout_reward_func/mean": 0.6918749809265137, + "rewards/rollout_reward_func/std": 0.4697249233722687, + "sampling/importance_sampling_ratio/max": 2.7838289737701416, + "sampling/importance_sampling_ratio/mean": 0.9136906266212463, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.7872750759124756, + "sampling/sampling_logp_difference/mean": 0.07199069857597351, + "step": 53, + "step_time": 12.247058065999909 + }, + { + "clip_ratio/high_max": 0.041652148589491844, + "clip_ratio/high_mean": 0.013425522716715932, + "clip_ratio/low_mean": 0.01002952002454549, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234550426248461, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2784.0, + "completions/max_terminated_length": 2784.0, + "completions/mean_length": 1590.09375, + "completions/mean_terminated_length": 1590.09375, + "completions/min_length": 1053.0, + "completions/min_terminated_length": 1053.0, + "entropy": 0.37869949638843536, + "epoch": 0.00432, + "frac_reward_zero_std": 0.125, + "grad_norm": 2.3980886936187744, + "kl": 0.05773049034178257, + "learning_rate": 5.999994227898604e-06, + "loss": -0.0192, + "num_tokens": 4058303.0, + "reward": 0.4609374701976776, + "reward_std": 0.35279375314712524, + "rewards/rollout_reward_func/mean": 0.4609374701976776, + "rewards/rollout_reward_func/std": 0.44058871269226074, + "sampling/importance_sampling_ratio/max": 2.2311129570007324, + "sampling/importance_sampling_ratio/mean": 0.9393452405929565, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.9994411468505859, + "sampling/sampling_logp_difference/mean": 0.08165294677019119, + "step": 54, + "step_time": 11.528886767999893 + }, + { + "clip_ratio/high_max": 0.02447916753590107, + "clip_ratio/high_mean": 0.0075732802506536245, + "clip_ratio/low_mean": 0.00947712454944849, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01705040503293276, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2444.0, + "completions/max_terminated_length": 2444.0, + "completions/mean_length": 1789.75, + "completions/mean_terminated_length": 1789.75, + "completions/min_length": 1055.0, + "completions/min_terminated_length": 1055.0, + "entropy": 0.36519913375377655, + "epoch": 0.0044, + "frac_reward_zero_std": 0.375, + "grad_norm": 2.2972187995910645, + "kl": 0.05506392475217581, + "learning_rate": 5.99999356873919e-06, + "loss": -0.1185, + "num_tokens": 4127411.0, + "reward": 0.40562498569488525, + "reward_std": 0.22391541302204132, + "rewards/rollout_reward_func/mean": 0.40562498569488525, + "rewards/rollout_reward_func/std": 0.3422500193119049, + "sampling/importance_sampling_ratio/max": 2.4115021228790283, + "sampling/importance_sampling_ratio/mean": 0.9461013674736023, + "sampling/importance_sampling_ratio/min": 0.14957794547080994, + "sampling/sampling_logp_difference/max": 1.0122857093811035, + "sampling/sampling_logp_difference/mean": 0.06259442120790482, + "step": 55, + "step_time": 11.493340414999693 + }, + { + "clip_ratio/high_max": 0.0438775522634387, + "clip_ratio/high_mean": 0.012457483448088169, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.012457483448088169, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2799.0, + "completions/max_terminated_length": 2799.0, + "completions/mean_length": 2306.375, + "completions/mean_terminated_length": 2306.375, + "completions/min_length": 1567.0, + "completions/min_terminated_length": 1567.0, + "entropy": 0.40949854254722595, + "epoch": 0.00448, + "frac_reward_zero_std": 0.375, + "grad_norm": 2.023374319076538, + "kl": 0.08688413165509701, + "learning_rate": 5.999992873949609e-06, + "loss": -0.0712, + "num_tokens": 4214487.0, + "reward": 0.296875, + "reward_std": 0.08874999731779099, + "rewards/rollout_reward_func/mean": 0.296875, + "rewards/rollout_reward_func/std": 0.15228237211704254, + "sampling/importance_sampling_ratio/max": 2.9896316528320312, + "sampling/importance_sampling_ratio/mean": 0.968756377696991, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.835113763809204, + "sampling/sampling_logp_difference/mean": 0.08090537041425705, + "step": 56, + "step_time": 13.222404719000224 + }, + { + "clip_ratio/high_max": 0.04506416339427233, + "clip_ratio/high_mean": 0.01424223161302507, + "clip_ratio/low_mean": 0.002842377289198339, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01708460901863873, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2441.0, + "completions/max_terminated_length": 2441.0, + "completions/mean_length": 1963.0, + "completions/mean_terminated_length": 1963.0, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.4080217182636261, + "epoch": 0.00456, + "frac_reward_zero_std": 0.375, + "grad_norm": 1.663516640663147, + "kl": 0.3106076046824455, + "learning_rate": 5.999992143529868e-06, + "loss": -0.0796, + "num_tokens": 4289619.0, + "reward": 0.3934375047683716, + "reward_std": 0.1563829779624939, + "rewards/rollout_reward_func/mean": 0.3934375047683716, + "rewards/rollout_reward_func/std": 0.30592650175094604, + "sampling/importance_sampling_ratio/max": 1.4833005666732788, + "sampling/importance_sampling_ratio/mean": 0.5795140862464905, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 3.2869999408721924, + "sampling/sampling_logp_difference/mean": 0.0979442298412323, + "step": 57, + "step_time": 11.762371722000125 + }, + { + "clip_ratio/high_max": 0.046875, + "clip_ratio/high_mean": 0.01171875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01171875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2802.0, + "completions/max_terminated_length": 2802.0, + "completions/mean_length": 1879.75, + "completions/mean_terminated_length": 1879.75, + "completions/min_length": 1053.0, + "completions/min_terminated_length": 1053.0, + "entropy": 0.3892976716160774, + "epoch": 0.00464, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.1009422540664673, + "kl": 0.05097049381583929, + "learning_rate": 5.999991377479982e-06, + "loss": -0.0191, + "num_tokens": 4362090.0, + "reward": 0.5262500047683716, + "reward_std": 0.1875, + "rewards/rollout_reward_func/mean": 0.5262500047683716, + "rewards/rollout_reward_func/std": 0.4009806215763092, + "sampling/importance_sampling_ratio/max": 2.9964590072631836, + "sampling/importance_sampling_ratio/mean": 1.0218505859375, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.0378296375274658, + "sampling/sampling_logp_difference/mean": 0.06926104426383972, + "step": 58, + "step_time": 13.15352440300012 + }, + { + "clip_ratio/high_max": 0.019571688026189804, + "clip_ratio/high_mean": 0.004892922006547451, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004892922006547451, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2832.0, + "completions/max_terminated_length": 2832.0, + "completions/mean_length": 2202.375, + "completions/mean_terminated_length": 2202.375, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.41450754553079605, + "epoch": 0.00472, + "frac_reward_zero_std": 0.625, + "grad_norm": 2.1054370403289795, + "kl": 0.03926007356494665, + "learning_rate": 5.999990575799961e-06, + "loss": 0.0595, + "num_tokens": 4446012.0, + "reward": 0.44343751668930054, + "reward_std": 0.13312500715255737, + "rewards/rollout_reward_func/mean": 0.44343751668930054, + "rewards/rollout_reward_func/std": 0.3428213894367218, + "sampling/importance_sampling_ratio/max": 2.236393928527832, + "sampling/importance_sampling_ratio/mean": 0.8590089678764343, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.9236248731613159, + "sampling/sampling_logp_difference/mean": 0.06904841959476471, + "step": 59, + "step_time": 13.58062329900008 + }, + { + "clip_ratio/high_max": 0.029240576550364494, + "clip_ratio/high_mean": 0.007310144137591124, + "clip_ratio/low_mean": 0.0030868902103975415, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.010397034231573343, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2429.0, + "completions/max_terminated_length": 2429.0, + "completions/mean_length": 2124.9375, + "completions/mean_terminated_length": 2124.9375, + "completions/min_length": 1567.0, + "completions/min_terminated_length": 1567.0, + "entropy": 0.370839923620224, + "epoch": 0.0048, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5435988903045654, + "kl": 0.08518982026726007, + "learning_rate": 5.99998973848982e-06, + "loss": -0.0579, + "num_tokens": 4527051.0, + "reward": 0.3590624928474426, + "reward_std": 0.06796419620513916, + "rewards/rollout_reward_func/mean": 0.3590624928474426, + "rewards/rollout_reward_func/std": 0.22809672355651855, + "sampling/importance_sampling_ratio/max": 2.2381787300109863, + "sampling/importance_sampling_ratio/mean": 0.8449472188949585, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.6413207054138184, + "sampling/sampling_logp_difference/mean": 0.069917693734169, + "step": 60, + "step_time": 11.73221161399988 + }, + { + "clip_ratio/high_max": 0.02281746082007885, + "clip_ratio/high_mean": 0.006954365293495357, + "clip_ratio/low_mean": 0.0037499999161809683, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.010704364976845682, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2767.0, + "completions/max_terminated_length": 2767.0, + "completions/mean_length": 1689.3125, + "completions/mean_terminated_length": 1689.3125, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.3677019253373146, + "epoch": 0.00488, + "frac_reward_zero_std": 0.375, + "grad_norm": 1.9829214811325073, + "kl": 0.058779667131602764, + "learning_rate": 5.999988865549569e-06, + "loss": 0.0304, + "num_tokens": 4593095.0, + "reward": 0.6549999713897705, + "reward_std": 0.22813192009925842, + "rewards/rollout_reward_func/mean": 0.6549999713897705, + "rewards/rollout_reward_func/std": 0.45271220803260803, + "sampling/importance_sampling_ratio/max": 1.883159875869751, + "sampling/importance_sampling_ratio/mean": 0.8463116884231567, + "sampling/importance_sampling_ratio/min": 0.22824469208717346, + "sampling/sampling_logp_difference/max": 1.781625747680664, + "sampling/sampling_logp_difference/mean": 0.06828776746988297, + "step": 61, + "step_time": 12.480462479000153 + }, + { + "clip_ratio/high_max": 0.0110975606366992, + "clip_ratio/high_mean": 0.0027743901591748, + "clip_ratio/low_mean": 0.0014880952658131719, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004262485424987972, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2815.0, + "completions/max_terminated_length": 2815.0, + "completions/mean_length": 2305.15625, + "completions/mean_terminated_length": 2305.15625, + "completions/min_length": 1571.0, + "completions/min_terminated_length": 1571.0, + "entropy": 0.4012472406029701, + "epoch": 0.00496, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8583229780197144, + "kl": 0.06238031107932329, + "learning_rate": 5.999987956979225e-06, + "loss": -0.0392, + "num_tokens": 4680377.0, + "reward": 0.3434374928474426, + "reward_std": 0.08029377460479736, + "rewards/rollout_reward_func/mean": 0.3434374928474426, + "rewards/rollout_reward_func/std": 0.22245851159095764, + "sampling/importance_sampling_ratio/max": 2.7665059566497803, + "sampling/importance_sampling_ratio/mean": 0.9882571697235107, + "sampling/importance_sampling_ratio/min": 0.059778764843940735, + "sampling/sampling_logp_difference/max": 0.9543299674987793, + "sampling/sampling_logp_difference/mean": 0.0674777626991272, + "step": 62, + "step_time": 14.56621960199982 + }, + { + "clip_ratio/high_max": 0.05253623379394412, + "clip_ratio/high_mean": 0.016606280929408967, + "clip_ratio/low_mean": 0.0018382353009656072, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.018444516230374575, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2423.0, + "completions/max_terminated_length": 2423.0, + "completions/mean_length": 1968.9375, + "completions/mean_terminated_length": 1968.9375, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "entropy": 0.4196172505617142, + "epoch": 0.00504, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.905213475227356, + "kl": 0.09828684013336897, + "learning_rate": 5.999987012778799e-06, + "loss": -0.0034, + "num_tokens": 4755993.0, + "reward": 0.33031249046325684, + "reward_std": 0.09654378145933151, + "rewards/rollout_reward_func/mean": 0.33031249046325684, + "rewards/rollout_reward_func/std": 0.21877197921276093, + "sampling/importance_sampling_ratio/max": 2.0344085693359375, + "sampling/importance_sampling_ratio/mean": 0.7417819499969482, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 2.208054542541504, + "sampling/sampling_logp_difference/mean": 0.07868118584156036, + "step": 63, + "step_time": 11.736785162999922 + }, + { + "clip_ratio/high_max": 0.0055555556900799274, + "clip_ratio/high_mean": 0.0027777778450399637, + "clip_ratio/low_mean": 0.003794643096625805, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006572420941665769, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2814.0, + "completions/max_terminated_length": 2814.0, + "completions/mean_length": 1972.9375, + "completions/mean_terminated_length": 1972.9375, + "completions/min_length": 1053.0, + "completions/min_terminated_length": 1053.0, + "entropy": 0.4023704081773758, + "epoch": 0.00512, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3090136051177979, + "kl": 0.12237261980772018, + "learning_rate": 5.9999860329483104e-06, + "loss": -0.194, + "num_tokens": 4831827.0, + "reward": 0.5878125429153442, + "reward_std": 0.142506942152977, + "rewards/rollout_reward_func/mean": 0.5878125429153442, + "rewards/rollout_reward_func/std": 0.4488244950771332, + "sampling/importance_sampling_ratio/max": 2.9826838970184326, + "sampling/importance_sampling_ratio/mean": 0.9495848417282104, + "sampling/importance_sampling_ratio/min": 0.01580546610057354, + "sampling/sampling_logp_difference/max": 2.0490379333496094, + "sampling/sampling_logp_difference/mean": 0.07465916872024536, + "step": 64, + "step_time": 12.846850890000042 + }, + { + "clip_ratio/high_max": 0.021152781788259745, + "clip_ratio/high_mean": 0.006812585634179413, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.006812585634179413, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2800.0, + "completions/max_terminated_length": 2800.0, + "completions/mean_length": 2005.0, + "completions/mean_terminated_length": 2005.0, + "completions/min_length": 1054.0, + "completions/min_terminated_length": 1054.0, + "entropy": 0.40342626720666885, + "epoch": 0.0052, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.2556148767471313, + "kl": 0.0746797863394022, + "learning_rate": 5.999985017487771e-06, + "loss": -0.0305, + "num_tokens": 4908716.0, + "reward": 0.3818749785423279, + "reward_std": 0.15371949970722198, + "rewards/rollout_reward_func/mean": 0.3818749785423279, + "rewards/rollout_reward_func/std": 0.3037022650241852, + "sampling/importance_sampling_ratio/max": 2.2475836277008057, + "sampling/importance_sampling_ratio/mean": 0.8594139814376831, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.4058151245117188, + "sampling/sampling_logp_difference/mean": 0.06716296076774597, + "step": 65, + "step_time": 12.423915739000222 + }, + { + "clip_ratio/high_max": 0.03018707549199462, + "clip_ratio/high_mean": 0.007546768872998655, + "clip_ratio/low_mean": 0.006225198740139604, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.013771967613138258, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2792.0, + "completions/max_terminated_length": 2792.0, + "completions/mean_length": 2048.40625, + "completions/mean_terminated_length": 2048.40625, + "completions/min_length": 1055.0, + "completions/min_terminated_length": 1055.0, + "entropy": 0.38859760761260986, + "epoch": 0.00528, + "frac_reward_zero_std": 0.375, + "grad_norm": 2.1451351642608643, + "kl": 0.22367357090115547, + "learning_rate": 5.999983966397197e-06, + "loss": -0.1677, + "num_tokens": 4987207.0, + "reward": 0.4712499976158142, + "reward_std": 0.18434235453605652, + "rewards/rollout_reward_func/mean": 0.4712499976158142, + "rewards/rollout_reward_func/std": 0.35322248935699463, + "sampling/importance_sampling_ratio/max": 2.9422554969787598, + "sampling/importance_sampling_ratio/mean": 0.9915428161621094, + "sampling/importance_sampling_ratio/min": 0.016011416912078857, + "sampling/sampling_logp_difference/max": 2.497363805770874, + "sampling/sampling_logp_difference/mean": 0.07474374771118164, + "step": 66, + "step_time": 12.926716641999974 + }, + { + "clip_ratio/high_max": 0.01376319769769907, + "clip_ratio/high_mean": 0.005043363547883928, + "clip_ratio/low_mean": 0.0014880952658131719, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0065314588136971, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2783.0, + "completions/max_terminated_length": 2783.0, + "completions/mean_length": 1772.40625, + "completions/mean_terminated_length": 1772.40625, + "completions/min_length": 1055.0, + "completions/min_terminated_length": 1055.0, + "entropy": 0.4034885838627815, + "epoch": 0.00536, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.5488033294677734, + "kl": 0.05104802828282118, + "learning_rate": 5.999982879676608e-06, + "loss": -0.041, + "num_tokens": 5055760.0, + "reward": 0.5737500190734863, + "reward_std": 0.16325795650482178, + "rewards/rollout_reward_func/mean": 0.5737500190734863, + "rewards/rollout_reward_func/std": 0.40885162353515625, + "sampling/importance_sampling_ratio/max": 2.278799057006836, + "sampling/importance_sampling_ratio/mean": 0.99635910987854, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 0.8835396766662598, + "sampling/sampling_logp_difference/mean": 0.06713330745697021, + "step": 67, + "step_time": 12.46692701400002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.004360465100035071, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004360465100035071, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2796.0, + "completions/max_terminated_length": 2796.0, + "completions/mean_length": 2270.40625, + "completions/mean_terminated_length": 2270.40625, + "completions/min_length": 1983.0, + "completions/min_terminated_length": 1983.0, + "entropy": 0.4274456053972244, + "epoch": 0.00544, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8523308038711548, + "kl": 0.13869191519916058, + "learning_rate": 5.9999817573260195e-06, + "loss": -0.1124, + "num_tokens": 5141713.0, + "reward": 0.2878125011920929, + "reward_std": 0.01750694215297699, + "rewards/rollout_reward_func/mean": 0.2878125011920929, + "rewards/rollout_reward_func/std": 0.03849880024790764, + "sampling/importance_sampling_ratio/max": 2.765258550643921, + "sampling/importance_sampling_ratio/mean": 0.8237208127975464, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.7204997539520264, + "sampling/sampling_logp_difference/mean": 0.08386299759149551, + "step": 68, + "step_time": 12.989918530000296 + }, + { + "clip_ratio/high_max": 0.04849738674238324, + "clip_ratio/high_mean": 0.016489425906911492, + "clip_ratio/low_mean": 0.0013888889225199819, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.017878314713016152, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2441.0, + "completions/max_terminated_length": 2441.0, + "completions/mean_length": 1876.46875, + "completions/mean_terminated_length": 1876.46875, + "completions/min_length": 1054.0, + "completions/min_terminated_length": 1054.0, + "entropy": 0.39091238379478455, + "epoch": 0.00552, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.4318950176239014, + "kl": 0.09389345720410347, + "learning_rate": 5.999980599345448e-06, + "loss": -0.0356, + "num_tokens": 5214177.0, + "reward": 0.5806249976158142, + "reward_std": 0.07874999940395355, + "rewards/rollout_reward_func/mean": 0.5806249976158142, + "rewards/rollout_reward_func/std": 0.4241190552711487, + "sampling/importance_sampling_ratio/max": 1.9308501482009888, + "sampling/importance_sampling_ratio/mean": 0.9257422089576721, + "sampling/importance_sampling_ratio/min": 0.21397040784358978, + "sampling/sampling_logp_difference/max": 1.7655794620513916, + "sampling/sampling_logp_difference/mean": 0.06729073822498322, + "step": 69, + "step_time": 11.706464537000102 + }, + { + "clip_ratio/high_max": 0.012202381156384945, + "clip_ratio/high_mean": 0.0030505952890962362, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0030505952890962362, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2429.0, + "completions/max_terminated_length": 2429.0, + "completions/mean_length": 1901.125, + "completions/mean_terminated_length": 1901.125, + "completions/min_length": 1056.0, + "completions/min_terminated_length": 1056.0, + "entropy": 0.4174434766173363, + "epoch": 0.0056, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.2482589483261108, + "kl": 0.0709009887650609, + "learning_rate": 5.999979405734914e-06, + "loss": -0.0875, + "num_tokens": 5287259.0, + "reward": 0.44999998807907104, + "reward_std": 0.19828803837299347, + "rewards/rollout_reward_func/mean": 0.44999998807907104, + "rewards/rollout_reward_func/std": 0.36232221126556396, + "sampling/importance_sampling_ratio/max": 2.240993022918701, + "sampling/importance_sampling_ratio/mean": 0.7815386652946472, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.9302306175231934, + "sampling/sampling_logp_difference/mean": 0.07109043747186661, + "step": 70, + "step_time": 11.92579563199979 + }, + { + "clip_ratio/high_max": 0.03573596617206931, + "clip_ratio/high_mean": 0.011878836317919195, + "clip_ratio/low_mean": 0.0017361111240461469, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01361494732555002, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2800.0, + "completions/max_terminated_length": 2800.0, + "completions/mean_length": 2164.34375, + "completions/mean_terminated_length": 2164.34375, + "completions/min_length": 1055.0, + "completions/min_terminated_length": 1055.0, + "entropy": 0.4176176115870476, + "epoch": 0.00568, + "frac_reward_zero_std": 0.375, + "grad_norm": 1.604291319847107, + "kl": 0.0694936579093337, + "learning_rate": 5.999978176494435e-06, + "loss": -0.1233, + "num_tokens": 5369772.0, + "reward": 0.4762499928474426, + "reward_std": 0.21075797080993652, + "rewards/rollout_reward_func/mean": 0.4762499928474426, + "rewards/rollout_reward_func/std": 0.3796156644821167, + "sampling/importance_sampling_ratio/max": 2.497992992401123, + "sampling/importance_sampling_ratio/mean": 0.8422503471374512, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.5480012893676758, + "sampling/sampling_logp_difference/mean": 0.0728713721036911, + "step": 71, + "step_time": 12.894382659999792 + }, + { + "clip_ratio/high_max": 0.02281746082007885, + "clip_ratio/high_mean": 0.008831319864839315, + "clip_ratio/low_mean": 0.003260501311160624, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.012091821059584618, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2443.0, + "completions/max_terminated_length": 2443.0, + "completions/mean_length": 1719.03125, + "completions/mean_terminated_length": 1719.03125, + "completions/min_length": 1054.0, + "completions/min_terminated_length": 1054.0, + "entropy": 0.39671653509140015, + "epoch": 0.00576, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5779305696487427, + "kl": 0.08434087503701448, + "learning_rate": 5.99997691162403e-06, + "loss": -0.0636, + "num_tokens": 5436596.0, + "reward": 0.6024999618530273, + "reward_std": 0.2729267477989197, + "rewards/rollout_reward_func/mean": 0.6024999618530273, + "rewards/rollout_reward_func/std": 0.4505694806575775, + "sampling/importance_sampling_ratio/max": 2.754258632659912, + "sampling/importance_sampling_ratio/mean": 0.9120872020721436, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 2.2350893020629883, + "sampling/sampling_logp_difference/mean": 0.07831829786300659, + "step": 72, + "step_time": 12.011820702999785 + }, + { + "clip_ratio/high_max": 0.013888888992369175, + "clip_ratio/high_mean": 0.0034722222480922937, + "clip_ratio/low_mean": 0.0017361111240461469, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005208333372138441, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2780.0, + "completions/max_terminated_length": 2780.0, + "completions/mean_length": 1720.125, + "completions/mean_terminated_length": 1720.125, + "completions/min_length": 1054.0, + "completions/min_terminated_length": 1054.0, + "entropy": 0.35359790176153183, + "epoch": 0.00584, + "frac_reward_zero_std": 0.375, + "grad_norm": 1.760622262954712, + "kl": 0.0876467265188694, + "learning_rate": 5.99997561112372e-06, + "loss": -0.0248, + "num_tokens": 5503333.0, + "reward": 0.7487499713897705, + "reward_std": 0.2987908720970154, + "rewards/rollout_reward_func/mean": 0.7487499713897705, + "rewards/rollout_reward_func/std": 0.46135663986206055, + "sampling/importance_sampling_ratio/max": 2.7877893447875977, + "sampling/importance_sampling_ratio/mean": 0.8998199701309204, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 1.3734312057495117, + "sampling/sampling_logp_difference/mean": 0.07869358360767365, + "step": 73, + "step_time": 13.119324159999906 + }, + { + "clip_ratio/high_max": 0.013701201416552067, + "clip_ratio/high_mean": 0.005070037324912846, + "clip_ratio/low_mean": 0.0030487803742289543, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008118817582726479, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2825.0, + "completions/max_terminated_length": 2825.0, + "completions/mean_length": 1916.5, + "completions/mean_terminated_length": 1916.5, + "completions/min_length": 1053.0, + "completions/min_terminated_length": 1053.0, + "entropy": 0.38354693353176117, + "epoch": 0.00592, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.8614935874938965, + "kl": 0.05992862023413181, + "learning_rate": 5.999974274993527e-06, + "loss": 0.012, + "num_tokens": 5576952.0, + "reward": 0.5674999952316284, + "reward_std": 0.20719751715660095, + "rewards/rollout_reward_func/mean": 0.5674999952316284, + "rewards/rollout_reward_func/std": 0.4317331612110138, + "sampling/importance_sampling_ratio/max": 1.6715911626815796, + "sampling/importance_sampling_ratio/mean": 0.8377959132194519, + "sampling/importance_sampling_ratio/min": 0.196980819106102, + "sampling/sampling_logp_difference/max": 0.913780689239502, + "sampling/sampling_logp_difference/mean": 0.06950188428163528, + "step": 74, + "step_time": 12.793109332000085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2785.0, + "completions/max_terminated_length": 2785.0, + "completions/mean_length": 1587.53125, + "completions/mean_terminated_length": 1587.53125, + "completions/min_length": 1054.0, + "completions/min_terminated_length": 1054.0, + "entropy": 0.35842984169721603, + "epoch": 0.006, + "frac_reward_zero_std": 0.375, + "grad_norm": 1.3697266578674316, + "kl": 0.05413582641631365, + "learning_rate": 5.99997290323347e-06, + "loss": -0.0661, + "num_tokens": 5639305.0, + "reward": 0.7106249928474426, + "reward_std": 0.30803900957107544, + "rewards/rollout_reward_func/mean": 0.7106249928474426, + "rewards/rollout_reward_func/std": 0.4498884081840515, + "sampling/importance_sampling_ratio/max": 1.9579815864562988, + "sampling/importance_sampling_ratio/mean": 0.8830677270889282, + "sampling/importance_sampling_ratio/min": 0.15943297743797302, + "sampling/sampling_logp_difference/max": 1.0672590732574463, + "sampling/sampling_logp_difference/mean": 0.06444922834634781, + "step": 75, + "step_time": 12.032428736000156 + }, + { + "epoch": 0.006, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 2488.6, + "eval_completions/max_terminated_length": 2488.6, + "eval_completions/mean_length": 1951.175, + "eval_completions/mean_terminated_length": 1951.175, + "eval_completions/min_length": 1361.4, + "eval_completions/min_terminated_length": 1361.4, + "eval_entropy": 0.37236364781856535, + "eval_frac_reward_zero_std": 0.1, + "eval_kl": 0.07629953697323799, + "eval_loss": -0.0013724860036745667, + "eval_num_tokens": 5639305.0, + "eval_reward": 0.47524999976158144, + "eval_reward_std": 0.37039353847503664, + "eval_rewards/rollout_reward_func/mean": 0.47524999976158144, + "eval_rewards/rollout_reward_func/std": 0.37039353176951406, + "eval_runtime": 10.5117, + "eval_samples_per_second": 0.951, + "eval_sampling/importance_sampling_ratio/max": 1.6587244033813477, + "eval_sampling/importance_sampling_ratio/mean": 0.8837172389030457, + "eval_sampling/importance_sampling_ratio/min": 0.32487900257110597, + "eval_sampling/sampling_logp_difference/max": 0.8625046908855438, + "eval_sampling/sampling_logp_difference/mean": 0.06909476891160012, + "eval_steps_per_second": 0.285, + "step": 75 + } + ], + "logging_steps": 1.0, + "max_steps": 25000, + "num_input_tokens_seen": 5639305, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}