{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 8320, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7604166666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 1854.0, "completions/mean_length": 2497.1875, "completions/mean_terminated_length": 892.4524129231771, "completions/min_length": 358.3333333333333, "completions/min_terminated_length": 358.3333333333333, "entropy": 0.018447159975767135, "epoch": 0.001201923076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.003163084853440523, "learning_rate": 9.989182692307692e-07, "loss": 0.0005, "num_tokens": 355418.0, "reward": 0.4239015281200409, "reward_std": 0.26765922208627063, "rewards/reward_fn/mean": 0.4239015281200409, "rewards/reward_fn/std": 0.26765922208627063, "sampling/importance_sampling_ratio/max": 1.0162243247032166, "sampling/importance_sampling_ratio/mean": 0.10611778870224953, "sampling/importance_sampling_ratio/min": 3.007131742075823e-05, "sampling/sampling_logp_difference/max": 2.884995619455973, "sampling/sampling_logp_difference/mean": 0.004054587256784241, "step": 10, "step_time": 10.55261356048286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2669.0, "completions/mean_length": 2536.21875, "completions/mean_terminated_length": 1269.5142822265625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.021224408783018588, "epoch": 0.002403846153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.002404542174190283, "learning_rate": 9.977163461538462e-07, "loss": 0.0021, "num_tokens": 579104.0, "reward": 0.4330599159002304, "reward_std": 0.30045658349990845, "rewards/reward_fn/mean": 0.4330599159002304, "rewards/reward_fn/std": 0.30045658349990845, "sampling/importance_sampling_ratio/max": 0.6985206753015518, "sampling/importance_sampling_ratio/mean": 0.06345750391483307, "sampling/importance_sampling_ratio/min": 8.310260227517574e-06, "sampling/sampling_logp_difference/max": 3.6029043197631836, "sampling/sampling_logp_difference/mean": 0.004661130020394921, "step": 20, "step_time": 7.118235421832651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 3000.0, "completions/max_terminated_length": 2110.0, "completions/mean_length": 2410.9895833333335, "completions/mean_terminated_length": 631.1653645833334, "completions/min_length": 157.66666666666666, "completions/min_terminated_length": 157.66666666666666, "entropy": 0.01721692197024822, "epoch": 0.003605769230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.0009388006874360144, "learning_rate": 9.965144230769231e-07, "loss": -0.0035, "num_tokens": 936943.0, "reward": 0.3777221739292145, "reward_std": 0.2659665991862615, "rewards/reward_fn/mean": 0.3777221739292145, "rewards/reward_fn/std": 0.26596658925215405, "sampling/importance_sampling_ratio/max": 1.0908578038215637, "sampling/importance_sampling_ratio/mean": 0.16680984695752463, "sampling/importance_sampling_ratio/min": 1.0802266084889803e-05, "sampling/sampling_logp_difference/max": 3.912922461827596, "sampling/sampling_logp_difference/mean": 0.00390184810385108, "step": 30, "step_time": 10.352300198748708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2116.5, "completions/mean_length": 2575.625, "completions/mean_terminated_length": 735.0285949707031, "completions/min_length": 102.5, "completions/min_terminated_length": 102.5, "entropy": 0.019362613279372453, "epoch": 0.004807692307692308, "frac_reward_zero_std": 0.0, "grad_norm": 0.004279293119907379, "learning_rate": 9.953125e-07, "loss": -0.0024, "num_tokens": 1170071.0, "reward": 0.4112517237663269, "reward_std": 0.2874909117817879, "rewards/reward_fn/mean": 0.4112517237663269, "rewards/reward_fn/std": 0.2874909043312073, "sampling/importance_sampling_ratio/max": 0.7069350033998489, "sampling/importance_sampling_ratio/mean": 0.09496690332889557, "sampling/importance_sampling_ratio/min": 3.283149942490127e-05, "sampling/sampling_logp_difference/max": 2.9255022406578064, "sampling/sampling_logp_difference/mean": 0.003960304893553257, "step": 40, "step_time": 7.208902366179973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7395833333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1785.6666666666667, "completions/mean_length": 2388.9479166666665, "completions/mean_terminated_length": 800.1063944498698, "completions/min_length": 216.33333333333334, "completions/min_terminated_length": 216.33333333333334, "entropy": 0.01809591446071863, "epoch": 0.006009615384615385, "frac_reward_zero_std": 0.0, "grad_norm": 0.0013039779150858521, "learning_rate": 9.941105769230768e-07, "loss": -0.0006, "num_tokens": 1502954.0, "reward": 0.44515878955523175, "reward_std": 0.2775481194257736, "rewards/reward_fn/mean": 0.44515878955523175, "rewards/reward_fn/std": 0.2775481194257736, "sampling/importance_sampling_ratio/max": 0.9713541467984518, "sampling/importance_sampling_ratio/mean": 0.14232338592410088, "sampling/importance_sampling_ratio/min": 7.251124482839562e-05, "sampling/sampling_logp_difference/max": 5.4971126317977905, "sampling/sampling_logp_difference/mean": 0.004008204288159807, "step": 50, "step_time": 10.412021436728537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2159.0, "completions/mean_length": 2263.359375, "completions/mean_terminated_length": 642.75, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.020646220445632933, "epoch": 0.007211538461538462, "frac_reward_zero_std": 0.0, "grad_norm": 0.0022904141806066036, "learning_rate": 9.92908653846154e-07, "loss": 0.0013, "num_tokens": 1732241.0, "reward": 0.4877139925956726, "reward_std": 0.2718782052397728, "rewards/reward_fn/mean": 0.4877139925956726, "rewards/reward_fn/std": 0.271878182888031, "sampling/importance_sampling_ratio/max": 1.0723138451576233, "sampling/importance_sampling_ratio/mean": 0.17401712387800217, "sampling/importance_sampling_ratio/min": 4.4446743459047866e-05, "sampling/sampling_logp_difference/max": 1.7924824357032776, "sampling/sampling_logp_difference/mean": 0.003943681484088302, "step": 60, "step_time": 7.204357923474163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1924.3333333333333, "completions/mean_length": 2327.03125, "completions/mean_terminated_length": 628.4505818684896, "completions/min_length": 201.33333333333334, "completions/min_terminated_length": 201.33333333333334, "entropy": 0.018960915971547365, "epoch": 0.008413461538461538, "frac_reward_zero_std": 0.0, "grad_norm": 0.0007900321506895125, "learning_rate": 9.917067307692307e-07, "loss": 0.0025, "num_tokens": 2061932.0, "reward": 0.44478259483973187, "reward_std": 0.30973803003629047, "rewards/reward_fn/mean": 0.44478259483973187, "rewards/reward_fn/std": 0.30973803003629047, "sampling/importance_sampling_ratio/max": 1.0047172904014587, "sampling/importance_sampling_ratio/mean": 0.16467456022898355, "sampling/importance_sampling_ratio/min": 1.5399872173323576e-05, "sampling/sampling_logp_difference/max": 4.859946012496948, "sampling/sampling_logp_difference/mean": 0.003873621734480063, "step": 70, "step_time": 10.41085409456864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.859375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1629.5, "completions/mean_length": 2685.890625, "completions/mean_terminated_length": 748.5, "completions/min_length": 344.5, "completions/min_terminated_length": 344.5, "entropy": 0.016247186157852412, "epoch": 0.009615384615384616, "frac_reward_zero_std": 0.0, "grad_norm": 0.005373419728130102, "learning_rate": 9.905048076923076e-07, "loss": 0.0029, "num_tokens": 2311917.0, "reward": 0.38243725895881653, "reward_std": 0.2547700107097626, "rewards/reward_fn/mean": 0.38243725895881653, "rewards/reward_fn/std": 0.2547700107097626, "sampling/importance_sampling_ratio/max": 0.5776601135730743, "sampling/importance_sampling_ratio/mean": 0.08489583432674408, "sampling/importance_sampling_ratio/min": 1.8046629293166916e-05, "sampling/sampling_logp_difference/max": 3.992743968963623, "sampling/sampling_logp_difference/mean": 0.0035122547997161746, "step": 80, "step_time": 7.134391188621521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2563.3333333333335, "completions/mean_length": 2032.4583333333333, "completions/mean_terminated_length": 618.4542439778646, "completions/min_length": 155.33333333333334, "completions/min_terminated_length": 155.33333333333334, "entropy": 0.019483743235468864, "epoch": 0.010817307692307692, "frac_reward_zero_std": 0.0, "grad_norm": 0.0023783922661095858, "learning_rate": 9.893028846153846e-07, "loss": 0.0037, "num_tokens": 2609625.0, "reward": 0.4526282449563344, "reward_std": 0.31486642360687256, "rewards/reward_fn/mean": 0.4526282449563344, "rewards/reward_fn/std": 0.31486642360687256, "sampling/importance_sampling_ratio/max": 1.0166077812512715, "sampling/importance_sampling_ratio/mean": 0.21576745311419168, "sampling/importance_sampling_ratio/min": 1.1885360635233155e-05, "sampling/sampling_logp_difference/max": 3.6006526152292886, "sampling/sampling_logp_difference/mean": 0.004151904334624608, "step": 90, "step_time": 10.322812335565686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1860.5, "completions/mean_length": 1595.15625, "completions/mean_terminated_length": 501.6521759033203, "completions/min_length": 126.5, "completions/min_terminated_length": 126.5, "entropy": 0.019492124672979116, "epoch": 0.01201923076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.002496167318895459, "learning_rate": 9.881009615384615e-07, "loss": -0.0023, "num_tokens": 2780843.0, "reward": 0.42721714079380035, "reward_std": 0.35759618878364563, "rewards/reward_fn/mean": 0.42721714079380035, "rewards/reward_fn/std": 0.35759617388248444, "sampling/importance_sampling_ratio/max": 1.6687843799591064, "sampling/importance_sampling_ratio/mean": 0.40005573630332947, "sampling/importance_sampling_ratio/min": 4.824376560463861e-06, "sampling/sampling_logp_difference/max": 4.454401969909668, "sampling/sampling_logp_difference/mean": 0.004055082099512219, "step": 100, "step_time": 7.227691024169326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1931.0, "completions/mean_length": 2288.2395833333335, "completions/mean_terminated_length": 775.4656575520834, "completions/min_length": 274.6666666666667, "completions/min_terminated_length": 274.6666666666667, "entropy": 0.020993015449494123, "epoch": 0.013221153846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.002677768934518099, "learning_rate": 9.868990384615384e-07, "loss": -0.0026, "num_tokens": 3104250.0, "reward": 0.4922171930472056, "reward_std": 0.2854652901490529, "rewards/reward_fn/mean": 0.4922171930472056, "rewards/reward_fn/std": 0.28546526034673053, "sampling/importance_sampling_ratio/max": 0.6673135856787363, "sampling/importance_sampling_ratio/mean": 0.10157663250962894, "sampling/importance_sampling_ratio/min": 7.01754425828464e-06, "sampling/sampling_logp_difference/max": 3.2912611166636148, "sampling/sampling_logp_difference/mean": 0.004281437645355861, "step": 110, "step_time": 10.542028254549951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2056.0, "completions/mean_length": 2023.96875, "completions/mean_terminated_length": 503.58335876464844, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.016061992663890124, "epoch": 0.014423076923076924, "frac_reward_zero_std": 0.25, "grad_norm": 0.0001856104499893263, "learning_rate": 9.856971153846154e-07, "loss": -0.004, "num_tokens": 3333584.0, "reward": 0.39645303785800934, "reward_std": 0.3377818614244461, "rewards/reward_fn/mean": 0.39645303785800934, "rewards/reward_fn/std": 0.3377818465232849, "sampling/importance_sampling_ratio/max": 1.0789106488227844, "sampling/importance_sampling_ratio/mean": 0.28015801310539246, "sampling/importance_sampling_ratio/min": 2.0821722046093782e-05, "sampling/sampling_logp_difference/max": 3.950108051300049, "sampling/sampling_logp_difference/mean": 0.003412871388718486, "step": 120, "step_time": 7.4243323159404095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2158.6666666666665, "completions/mean_length": 2249.2916666666665, "completions/mean_terminated_length": 606.0416666666666, "completions/min_length": 143.66666666666666, "completions/min_terminated_length": 143.66666666666666, "entropy": 0.020050639193505047, "epoch": 0.015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.0012881775619462132, "learning_rate": 9.844951923076923e-07, "loss": -0.0021, "num_tokens": 3667748.0, "reward": 0.5218662818272909, "reward_std": 0.2814968128999074, "rewards/reward_fn/mean": 0.5218662818272909, "rewards/reward_fn/std": 0.2814968128999074, "sampling/importance_sampling_ratio/max": 1.4757625460624695, "sampling/importance_sampling_ratio/mean": 0.1709281007448832, "sampling/importance_sampling_ratio/min": 2.6822768935138203e-05, "sampling/sampling_logp_difference/max": 4.204032103220622, "sampling/sampling_logp_difference/mean": 0.0038598590375234685, "step": 130, "step_time": 10.493045585788787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2117.5, "completions/mean_length": 2631.6875, "completions/mean_terminated_length": 1176.5555725097656, "completions/min_length": 197.5, "completions/min_terminated_length": 197.5, "entropy": 0.018219062034040688, "epoch": 0.016826923076923076, "frac_reward_zero_std": 0.0, "grad_norm": 0.0015549284871667624, "learning_rate": 9.832932692307693e-07, "loss": -0.0009, "num_tokens": 3927576.0, "reward": 0.4733758121728897, "reward_std": 0.22228125482797623, "rewards/reward_fn/mean": 0.4733758121728897, "rewards/reward_fn/std": 0.22228126227855682, "sampling/importance_sampling_ratio/max": 1.2314740419387817, "sampling/importance_sampling_ratio/mean": 0.09447035938501358, "sampling/importance_sampling_ratio/min": 1.052286415870185e-05, "sampling/sampling_logp_difference/max": 4.338261246681213, "sampling/sampling_logp_difference/mean": 0.004096228629350662, "step": 140, "step_time": 7.206147984135896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6145833333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1697.3333333333333, "completions/mean_length": 2044.0208333333333, "completions/mean_terminated_length": 490.77272542317706, "completions/min_length": 148.66666666666666, "completions/min_terminated_length": 148.66666666666666, "entropy": 0.018800562154501675, "epoch": 0.018028846153846152, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.0006495500565506518, "learning_rate": 9.820913461538462e-07, "loss": -0.0023, "num_tokens": 4236202.0, "reward": 0.46717366576194763, "reward_std": 0.3116947164138158, "rewards/reward_fn/mean": 0.46717366576194763, "rewards/reward_fn/std": 0.31169472138086957, "sampling/importance_sampling_ratio/max": 1.030432403087616, "sampling/importance_sampling_ratio/mean": 0.2207338586449623, "sampling/importance_sampling_ratio/min": 0.00010002030118509235, "sampling/sampling_logp_difference/max": 4.307291348775228, "sampling/sampling_logp_difference/mean": 0.0035794926807284355, "step": 150, "step_time": 10.547765684034676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2261.5, "completions/mean_length": 2379.921875, "completions/mean_terminated_length": 821.8083801269531, "completions/min_length": 259.5, "completions/min_terminated_length": 259.5, "entropy": 0.015533651318401098, "epoch": 0.019230769230769232, "frac_reward_zero_std": 0.125, "grad_norm": 0.0066866702400147915, "learning_rate": 9.80889423076923e-07, "loss": -0.008, "num_tokens": 4466653.0, "reward": 0.38571617007255554, "reward_std": 0.261713907122612, "rewards/reward_fn/mean": 0.38571617007255554, "rewards/reward_fn/std": 0.2617138996720314, "sampling/importance_sampling_ratio/max": 0.7953649312257767, "sampling/importance_sampling_ratio/mean": 0.20777825824916363, "sampling/importance_sampling_ratio/min": 2.4937685338954907e-05, "sampling/sampling_logp_difference/max": 5.00935435295105, "sampling/sampling_logp_difference/mean": 0.0034976950846612453, "step": 160, "step_time": 7.1709523963741955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8020833333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2244.3333333333335, "completions/mean_length": 2614.7291666666665, "completions/mean_terminated_length": 1463.7762044270833, "completions/min_length": 931.0, "completions/min_terminated_length": 931.0, "entropy": 0.017413920257240534, "epoch": 0.020432692307692308, "frac_reward_zero_std": 0.0, "grad_norm": 0.0038856256287544966, "learning_rate": 9.796875e-07, "loss": 0.0044, "num_tokens": 4831939.0, "reward": 0.4507540663083394, "reward_std": 0.2550062636534373, "rewards/reward_fn/mean": 0.4507540663083394, "rewards/reward_fn/std": 0.2550062636534373, "sampling/importance_sampling_ratio/max": 1.1696496605873108, "sampling/importance_sampling_ratio/mean": 0.16193894296884537, "sampling/importance_sampling_ratio/min": 5.491954080364773e-05, "sampling/sampling_logp_difference/max": 5.057154814402263, "sampling/sampling_logp_difference/mean": 0.0037571745148549476, "step": 170, "step_time": 10.539024060498923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1158.5, "completions/mean_length": 2779.046875, "completions/mean_terminated_length": 662.25, "completions/min_length": 274.5, "completions/min_terminated_length": 274.5, "entropy": 0.01769241876900196, "epoch": 0.021634615384615384, "frac_reward_zero_std": 0.0, "grad_norm": 0.00015522421745117754, "learning_rate": 9.784855769230768e-07, "loss": 0.0009, "num_tokens": 5089750.0, "reward": 0.3424798548221588, "reward_std": 0.2154032438993454, "rewards/reward_fn/mean": 0.3424798548221588, "rewards/reward_fn/std": 0.2154032364487648, "sampling/importance_sampling_ratio/max": 0.6384808421134949, "sampling/importance_sampling_ratio/mean": 0.07789264433085918, "sampling/importance_sampling_ratio/min": 4.068531136258713e-05, "sampling/sampling_logp_difference/max": 7.3650572299957275, "sampling/sampling_logp_difference/mean": 0.003942229668609798, "step": 180, "step_time": 7.243509226292372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1613.3333333333333, "completions/mean_length": 1974.6145833333333, "completions/mean_terminated_length": 486.20741780598956, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.01880432851612568, "epoch": 0.02283653846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.00722771929576993, "learning_rate": 9.772836538461538e-07, "loss": 0.0169, "num_tokens": 5494793.0, "reward": 0.5388168195883433, "reward_std": 0.3067310154438019, "rewards/reward_fn/mean": 0.5388168195883433, "rewards/reward_fn/std": 0.3067310353120168, "sampling/importance_sampling_ratio/max": 1.7393125693003337, "sampling/importance_sampling_ratio/mean": 0.2721094638109207, "sampling/importance_sampling_ratio/min": 8.882510307254658e-05, "sampling/sampling_logp_difference/max": 4.504290024439494, "sampling/sampling_logp_difference/mean": 0.003792121618365248, "step": 190, "step_time": 11.523718248028308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 2299.859375, "completions/mean_terminated_length": 485.8000183105469, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.01885360199958086, "epoch": 0.02403846153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.0018633543513715267, "learning_rate": 9.760817307692307e-07, "loss": -0.0016, "num_tokens": 5881016.0, "reward": 0.5193114280700684, "reward_std": 0.2786804586648941, "rewards/reward_fn/mean": 0.5193114280700684, "rewards/reward_fn/std": 0.2786804586648941, "sampling/importance_sampling_ratio/max": 1.055225431919098, "sampling/importance_sampling_ratio/mean": 0.16781341284513474, "sampling/importance_sampling_ratio/min": 3.866622435566569e-07, "sampling/sampling_logp_difference/max": 7.937512397766113, "sampling/sampling_logp_difference/mean": 0.004202596610412002, "step": 200, "step_time": 9.623870695196093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5104166666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 1461.0, "completions/mean_length": 1755.1458333333333, "completions/mean_terminated_length": 539.2523905436198, "completions/min_length": 143.33333333333334, "completions/min_terminated_length": 143.33333333333334, "entropy": 0.017931907996535303, "epoch": 0.025240384615384616, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.006462108809500933, "learning_rate": 9.748798076923077e-07, "loss": 0.0007, "num_tokens": 6144958.0, "reward": 0.5075608392556509, "reward_std": 0.28299715121587116, "rewards/reward_fn/mean": 0.5075608392556509, "rewards/reward_fn/std": 0.28299715121587116, "sampling/importance_sampling_ratio/max": 1.0999391277631123, "sampling/importance_sampling_ratio/mean": 0.2838456407189369, "sampling/importance_sampling_ratio/min": 0.0001407248931476109, "sampling/sampling_logp_difference/max": 2.3891287247339883, "sampling/sampling_logp_difference/mean": 0.0034232339821755886, "step": 210, "step_time": 10.46414681598544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2526.0, "completions/mean_length": 2177.640625, "completions/mean_terminated_length": 881.36669921875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.018199982680380346, "epoch": 0.026442307692307692, "frac_reward_zero_std": 0.0, "grad_norm": 0.0022470273543149233, "learning_rate": 9.736778846153846e-07, "loss": 0.0005, "num_tokens": 6343743.0, "reward": 0.5316094756126404, "reward_std": 0.3387327641248703, "rewards/reward_fn/mean": 0.5316094756126404, "rewards/reward_fn/std": 0.3387327641248703, "sampling/importance_sampling_ratio/max": 1.3275049030780792, "sampling/importance_sampling_ratio/mean": 0.1624165177345276, "sampling/importance_sampling_ratio/min": 8.577288298283747e-06, "sampling/sampling_logp_difference/max": 3.6041401624679565, "sampling/sampling_logp_difference/mean": 0.004056592006236315, "step": 220, "step_time": 7.125956924352795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6354166666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 1542.6666666666667, "completions/mean_length": 2131.9270833333335, "completions/mean_terminated_length": 637.1281433105469, "completions/min_length": 279.3333333333333, "completions/min_terminated_length": 279.3333333333333, "entropy": 0.019933482632040977, "epoch": 0.027644230769230768, "frac_reward_zero_std": 0.0, "grad_norm": 0.0012092768447473645, "learning_rate": 9.724759615384615e-07, "loss": -0.0042, "num_tokens": 6667776.0, "reward": 0.5298199454943339, "reward_std": 0.29798005024592084, "rewards/reward_fn/mean": 0.5298199454943339, "rewards/reward_fn/std": 0.29798006018002826, "sampling/importance_sampling_ratio/max": 1.1110785206158955, "sampling/importance_sampling_ratio/mean": 0.1435714066028595, "sampling/importance_sampling_ratio/min": 5.234124061341087e-05, "sampling/sampling_logp_difference/max": 3.8322461446126304, "sampling/sampling_logp_difference/mean": 0.004198808843890826, "step": 230, "step_time": 10.486779965274035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1842.0, "completions/mean_length": 2082.515625, "completions/mean_terminated_length": 741.5769348144531, "completions/min_length": 134.5, "completions/min_terminated_length": 134.5, "entropy": 0.01685540536418557, "epoch": 0.028846153846153848, "frac_reward_zero_std": 0.125, "grad_norm": 0.0014095621882006526, "learning_rate": 9.712740384615385e-07, "loss": 0.0103, "num_tokens": 6893265.0, "reward": 0.3671119213104248, "reward_std": 0.32736505568027496, "rewards/reward_fn/mean": 0.3671119213104248, "rewards/reward_fn/std": 0.32736505568027496, "sampling/importance_sampling_ratio/max": 1.884002536535263, "sampling/importance_sampling_ratio/mean": 0.3767280876636505, "sampling/importance_sampling_ratio/min": 1.3983759345137514e-05, "sampling/sampling_logp_difference/max": 4.271053791046143, "sampling/sampling_logp_difference/mean": 0.0034473949344828725, "step": 240, "step_time": 7.27706265822053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6041666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2432.3333333333335, "completions/mean_length": 2074.0104166666665, "completions/mean_terminated_length": 918.021250406901, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.018033031187951565, "epoch": 0.030048076923076924, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.01246865838766098, "learning_rate": 9.700721153846152e-07, "loss": 0.0088, "num_tokens": 7195010.0, "reward": 0.4532705942789714, "reward_std": 0.3101769983768463, "rewards/reward_fn/mean": 0.4532705942789714, "rewards/reward_fn/std": 0.31017698844273883, "sampling/importance_sampling_ratio/max": 1.1092299222946167, "sampling/importance_sampling_ratio/mean": 0.24546280006567636, "sampling/importance_sampling_ratio/min": 0.00021196592950426899, "sampling/sampling_logp_difference/max": 7.934118588765462, "sampling/sampling_logp_difference/mean": 0.003929695657764872, "step": 250, "step_time": 10.33738683918491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1549.0, "completions/mean_length": 2571.078125, "completions/mean_terminated_length": 712.4166870117188, "completions/min_length": 219.5, "completions/min_terminated_length": 219.5, "entropy": 0.016906077042222023, "epoch": 0.03125, "frac_reward_zero_std": 0.0, "grad_norm": 0.005458444356918335, "learning_rate": 9.688701923076924e-07, "loss": 0.0, "num_tokens": 7426583.0, "reward": 0.3408375531435013, "reward_std": 0.2221287488937378, "rewards/reward_fn/mean": 0.3408375531435013, "rewards/reward_fn/std": 0.2221287414431572, "sampling/importance_sampling_ratio/max": 1.2117692828178406, "sampling/importance_sampling_ratio/mean": 0.14858610928058624, "sampling/importance_sampling_ratio/min": 0.00018990332318935543, "sampling/sampling_logp_difference/max": 3.382055640220642, "sampling/sampling_logp_difference/mean": 0.003656316315755248, "step": 260, "step_time": 7.073525336291641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6458333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1339.0, "completions/mean_length": 2102.0729166666665, "completions/mean_terminated_length": 442.78740946451825, "completions/min_length": 126.33333333333333, "completions/min_terminated_length": 126.33333333333333, "entropy": 0.016795550100505353, "epoch": 0.03245192307692308, "frac_reward_zero_std": 0.0, "grad_norm": 0.004597453400492668, "learning_rate": 9.67668269230769e-07, "loss": -0.0024, "num_tokens": 7729718.0, "reward": 0.387935350338618, "reward_std": 0.29140980045000714, "rewards/reward_fn/mean": 0.387935350338618, "rewards/reward_fn/std": 0.29140981038411456, "sampling/importance_sampling_ratio/max": 1.067175288995107, "sampling/importance_sampling_ratio/mean": 0.24855677783489227, "sampling/importance_sampling_ratio/min": 0.00014689077541637138, "sampling/sampling_logp_difference/max": 4.973676681518555, "sampling/sampling_logp_difference/mean": 0.0037352641423543296, "step": 270, "step_time": 10.446550993807614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2063.5, "completions/mean_length": 2260.296875, "completions/mean_terminated_length": 993.9285888671875, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "entropy": 0.018303512316197156, "epoch": 0.03365384615384615, "frac_reward_zero_std": 0.0, "grad_norm": 0.0014140036655589938, "learning_rate": 9.66466346153846e-07, "loss": -0.0054, "num_tokens": 7943017.0, "reward": 0.49134431779384613, "reward_std": 0.3223358988761902, "rewards/reward_fn/mean": 0.49134431779384613, "rewards/reward_fn/std": 0.322335883975029, "sampling/importance_sampling_ratio/max": 0.7534581571817398, "sampling/importance_sampling_ratio/mean": 0.15249896422028542, "sampling/importance_sampling_ratio/min": 2.40880672208732e-06, "sampling/sampling_logp_difference/max": 4.226468563079834, "sampling/sampling_logp_difference/mean": 0.003883342957124114, "step": 280, "step_time": 7.138110697641968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2213.0, "completions/mean_length": 1980.5625, "completions/mean_terminated_length": 606.0944112141927, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.020054421667009593, "epoch": 0.03485576923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.0014058782253414392, "learning_rate": 9.65264423076923e-07, "loss": -0.0044, "num_tokens": 8242863.0, "reward": 0.46114611625671387, "reward_std": 0.31637027362982434, "rewards/reward_fn/mean": 0.46114611625671387, "rewards/reward_fn/std": 0.3163702537616094, "sampling/importance_sampling_ratio/max": 0.8900170723597208, "sampling/importance_sampling_ratio/mean": 0.20176845292250314, "sampling/importance_sampling_ratio/min": 4.526400850105953e-05, "sampling/sampling_logp_difference/max": 3.990814765294393, "sampling/sampling_logp_difference/mean": 0.0039402142477532225, "step": 290, "step_time": 10.486494569014758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1905.5, "completions/mean_length": 1544.5625, "completions/mean_terminated_length": 412.55555725097656, "completions/min_length": 139.5, "completions/min_terminated_length": 139.5, "entropy": 0.018313859310001134, "epoch": 0.036057692307692304, "frac_reward_zero_std": 0.0, "grad_norm": 0.003455411409959197, "learning_rate": 9.640625e-07, "loss": -0.0106, "num_tokens": 8413499.0, "reward": 0.5526554584503174, "reward_std": 0.35268209874629974, "rewards/reward_fn/mean": 0.5526554584503174, "rewards/reward_fn/std": 0.35268208384513855, "sampling/importance_sampling_ratio/max": 1.3423713445663452, "sampling/importance_sampling_ratio/mean": 0.354382187128067, "sampling/importance_sampling_ratio/min": 0.0004160461248829961, "sampling/sampling_logp_difference/max": 3.2846263647079468, "sampling/sampling_logp_difference/mean": 0.0038829914992675185, "step": 300, "step_time": 7.181526190880686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1651.6666666666667, "completions/mean_length": 1933.0833333333333, "completions/mean_terminated_length": 506.2410074869792, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.01788173634558916, "epoch": 0.037259615384615384, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.0024897113908082247, "learning_rate": 9.628605769230769e-07, "loss": 0.0052, "num_tokens": 8710771.0, "reward": 0.5737349589665731, "reward_std": 0.26112934450308484, "rewards/reward_fn/mean": 0.5737349589665731, "rewards/reward_fn/std": 0.26112934450308484, "sampling/importance_sampling_ratio/max": 1.3681105772654216, "sampling/importance_sampling_ratio/mean": 0.29560016095638275, "sampling/importance_sampling_ratio/min": 0.00018242035595979664, "sampling/sampling_logp_difference/max": 3.4793038368225098, "sampling/sampling_logp_difference/mean": 0.0035757292062044144, "step": 310, "step_time": 10.487889216095208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2127.5, "completions/mean_length": 1947.59375, "completions/mean_terminated_length": 722.896484375, "completions/min_length": 131.5, "completions/min_terminated_length": 131.5, "entropy": 0.021266778744757176, "epoch": 0.038461538461538464, "frac_reward_zero_std": 0.0, "grad_norm": 0.0037867010105401278, "learning_rate": 9.616586538461538e-07, "loss": 0.0026, "num_tokens": 8909241.0, "reward": 0.6189470589160919, "reward_std": 0.25249937176704407, "rewards/reward_fn/mean": 0.6189470589160919, "rewards/reward_fn/std": 0.25249937176704407, "sampling/importance_sampling_ratio/max": 1.0580606162548065, "sampling/importance_sampling_ratio/mean": 0.15583503991365433, "sampling/importance_sampling_ratio/min": 0.0001919468049891293, "sampling/sampling_logp_difference/max": 5.763197660446167, "sampling/sampling_logp_difference/mean": 0.004475279012694955, "step": 320, "step_time": 7.157194932084531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6770833333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1628.3333333333333, "completions/mean_length": 2230.5208333333335, "completions/mean_terminated_length": 656.1583557128906, "completions/min_length": 225.33333333333334, "completions/min_terminated_length": 225.33333333333334, "entropy": 0.020964619889855384, "epoch": 0.039663461538461536, "frac_reward_zero_std": 0.0, "grad_norm": 0.0012933936668559909, "learning_rate": 9.604567307692308e-07, "loss": 0.0027, "num_tokens": 9244147.0, "reward": 0.5332527160644531, "reward_std": 0.29489131768544513, "rewards/reward_fn/mean": 0.5332527160644531, "rewards/reward_fn/std": 0.29489131768544513, "sampling/importance_sampling_ratio/max": 1.3274634679158528, "sampling/importance_sampling_ratio/mean": 0.18171557784080505, "sampling/importance_sampling_ratio/min": 1.0075919666026797e-05, "sampling/sampling_logp_difference/max": 3.738051096598307, "sampling/sampling_logp_difference/mean": 0.004341102205216885, "step": 330, "step_time": 10.577109040413053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1800.0, "completions/mean_length": 1752.078125, "completions/mean_terminated_length": 544.9429321289062, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.019728436879813672, "epoch": 0.040865384615384616, "frac_reward_zero_std": 0.125, "grad_norm": 0.003659631824120879, "learning_rate": 9.592548076923077e-07, "loss": -0.0087, "num_tokens": 9429456.0, "reward": 0.5022579580545425, "reward_std": 0.3523288071155548, "rewards/reward_fn/mean": 0.5022579580545425, "rewards/reward_fn/std": 0.3523288071155548, "sampling/importance_sampling_ratio/max": 1.4135321080684662, "sampling/importance_sampling_ratio/mean": 0.33609023690223694, "sampling/importance_sampling_ratio/min": 0.00029637124316650443, "sampling/sampling_logp_difference/max": 3.1204572916030884, "sampling/sampling_logp_difference/mean": 0.004011535318568349, "step": 340, "step_time": 7.212607515975833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6666666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2725.3333333333335, "completions/mean_length": 2380.375, "completions/mean_terminated_length": 1125.7722574869792, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.017688877321779727, "epoch": 0.042067307692307696, "frac_reward_zero_std": 0.0, "grad_norm": 0.0018405967857688665, "learning_rate": 9.580528846153846e-07, "loss": -0.0064, "num_tokens": 9753852.0, "reward": 0.454216072956721, "reward_std": 0.3175173004468282, "rewards/reward_fn/mean": 0.454216072956721, "rewards/reward_fn/std": 0.3175172805786133, "sampling/importance_sampling_ratio/max": 1.2488041321436565, "sampling/importance_sampling_ratio/mean": 0.13971723864475885, "sampling/importance_sampling_ratio/min": 3.0134233080995425e-05, "sampling/sampling_logp_difference/max": 2.6658068100611367, "sampling/sampling_logp_difference/mean": 0.00363180716522038, "step": 350, "step_time": 10.458810392487795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2158.0, "completions/mean_length": 2149.265625, "completions/mean_terminated_length": 739.0490417480469, "completions/min_length": 137.5, "completions/min_terminated_length": 137.5, "entropy": 0.01617119777947664, "epoch": 0.04326923076923077, "frac_reward_zero_std": 0.125, "grad_norm": 0.0015890736831352115, "learning_rate": 9.568509615384614e-07, "loss": -0.0, "num_tokens": 9951757.0, "reward": 0.4698807895183563, "reward_std": 0.3345671743154526, "rewards/reward_fn/mean": 0.4698807895183563, "rewards/reward_fn/std": 0.3345671743154526, "sampling/importance_sampling_ratio/max": 0.9479912221431732, "sampling/importance_sampling_ratio/mean": 0.22397014126181602, "sampling/importance_sampling_ratio/min": 3.853435828204965e-06, "sampling/sampling_logp_difference/max": 6.253631591796875, "sampling/sampling_logp_difference/mean": 0.003995730075985193, "step": 360, "step_time": 7.17925412254408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7083333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1581.0, "completions/mean_length": 2347.8541666666665, "completions/mean_terminated_length": 700.5809834798177, "completions/min_length": 181.66666666666666, "completions/min_terminated_length": 181.66666666666666, "entropy": 0.01952901417389512, "epoch": 0.04447115384615385, "frac_reward_zero_std": 0.0, "grad_norm": 0.004848910961300135, "learning_rate": 9.556490384615385e-07, "loss": -0.0018, "num_tokens": 10290351.0, "reward": 0.4845337967077891, "reward_std": 0.2866665820280711, "rewards/reward_fn/mean": 0.4845337967077891, "rewards/reward_fn/std": 0.2866665720939636, "sampling/importance_sampling_ratio/max": 1.324170470237732, "sampling/importance_sampling_ratio/mean": 0.15245153258244196, "sampling/importance_sampling_ratio/min": 3.066437360151516e-05, "sampling/sampling_logp_difference/max": 3.500709295272827, "sampling/sampling_logp_difference/mean": 0.0040606907568871975, "step": 370, "step_time": 10.532285568118095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1740.5, "completions/mean_length": 1720.5, "completions/mean_terminated_length": 525.9301452636719, "completions/min_length": 159.5, "completions/min_terminated_length": 159.5, "entropy": 0.020436960272490978, "epoch": 0.04567307692307692, "frac_reward_zero_std": 0.0, "grad_norm": 0.004832131322473288, "learning_rate": 9.544471153846153e-07, "loss": -0.0029, "num_tokens": 10479367.0, "reward": 0.6267639696598053, "reward_std": 0.3001161217689514, "rewards/reward_fn/mean": 0.6267639696598053, "rewards/reward_fn/std": 0.3001161068677902, "sampling/importance_sampling_ratio/max": 1.6554638147354126, "sampling/importance_sampling_ratio/mean": 0.24717678874731064, "sampling/importance_sampling_ratio/min": 0.00013800638771499507, "sampling/sampling_logp_difference/max": 4.938496828079224, "sampling/sampling_logp_difference/mean": 0.004146568709984422, "step": 380, "step_time": 7.348376268800348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6354166666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2296.3333333333335, "completions/mean_length": 2230.125, "completions/mean_terminated_length": 857.3701375325521, "completions/min_length": 222.66666666666666, "completions/min_terminated_length": 222.66666666666666, "entropy": 0.02126995287835598, "epoch": 0.046875, "frac_reward_zero_std": 0.0, "grad_norm": 0.009783285669982433, "learning_rate": 9.532451923076923e-07, "loss": -0.0029, "num_tokens": 10794459.0, "reward": 0.47101734081904095, "reward_std": 0.288085013628006, "rewards/reward_fn/mean": 0.47101734081904095, "rewards/reward_fn/std": 0.288085013628006, "sampling/importance_sampling_ratio/max": 0.881293515364329, "sampling/importance_sampling_ratio/mean": 0.13057787219683328, "sampling/importance_sampling_ratio/min": 1.227038620414215e-05, "sampling/sampling_logp_difference/max": 8.208312431971232, "sampling/sampling_logp_difference/mean": 0.004447052255272865, "step": 390, "step_time": 10.411797516420483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 3000.0, "completions/max_terminated_length": 520.5, "completions/mean_length": 1288.65625, "completions/mean_terminated_length": 207.48078155517578, "completions/min_length": 122.5, "completions/min_terminated_length": 122.5, "entropy": 0.014642491471022367, "epoch": 0.04807692307692308, "frac_reward_zero_std": 0.125, "grad_norm": 0.0020279414020478725, "learning_rate": 9.520432692307691e-07, "loss": -0.02, "num_tokens": 10951797.0, "reward": 0.47283533215522766, "reward_std": 0.3358086496591568, "rewards/reward_fn/mean": 0.47283533215522766, "rewards/reward_fn/std": 0.335808664560318, "sampling/importance_sampling_ratio/max": 1.5041191577911377, "sampling/importance_sampling_ratio/mean": 0.481951579451561, "sampling/importance_sampling_ratio/min": 8.213657542910369e-05, "sampling/sampling_logp_difference/max": 3.3500800132751465, "sampling/sampling_logp_difference/mean": 0.002975012524984777, "step": 400, "step_time": 6.991425440739841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1266.6666666666667, "completions/mean_length": 1890.3645833333333, "completions/mean_terminated_length": 499.27845255533856, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.01970140039920807, "epoch": 0.04927884615384615, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.0017067261505872011, "learning_rate": 9.508413461538462e-07, "loss": 0.0009, "num_tokens": 11234952.0, "reward": 0.5022187928358713, "reward_std": 0.29443572958310443, "rewards/reward_fn/mean": 0.5022187928358713, "rewards/reward_fn/std": 0.294435719648997, "sampling/importance_sampling_ratio/max": 0.9803973038991293, "sampling/importance_sampling_ratio/mean": 0.2477222979068756, "sampling/importance_sampling_ratio/min": 4.8608192476725286e-05, "sampling/sampling_logp_difference/max": 3.6099753379821777, "sampling/sampling_logp_difference/mean": 0.004195692483335733, "step": 410, "step_time": 10.36977256089449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 2022.15625, "completions/mean_terminated_length": 618.7857208251953, "completions/min_length": 93.5, "completions/min_terminated_length": 93.5, "entropy": 0.020211321674287318, "epoch": 0.05048076923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.005331914871931076, "learning_rate": 9.49639423076923e-07, "loss": -0.0006, "num_tokens": 11437994.0, "reward": 0.5758581459522247, "reward_std": 0.2999257743358612, "rewards/reward_fn/mean": 0.5758581459522247, "rewards/reward_fn/std": 0.2999257892370224, "sampling/importance_sampling_ratio/max": 1.7051409482955933, "sampling/importance_sampling_ratio/mean": 0.23473121970891953, "sampling/importance_sampling_ratio/min": 3.312808775035592e-05, "sampling/sampling_logp_difference/max": 2.7839633226394653, "sampling/sampling_logp_difference/mean": 0.0037573095178231597, "step": 420, "step_time": 7.1433165620081125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6770833333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1836.3333333333333, "completions/mean_length": 2291.9166666666665, "completions/mean_terminated_length": 900.791005452474, "completions/min_length": 197.33333333333334, "completions/min_terminated_length": 197.33333333333334, "entropy": 0.01927726911380887, "epoch": 0.051682692307692304, "frac_reward_zero_std": 0.0, "grad_norm": 0.004541941452771425, "learning_rate": 9.484375e-07, "loss": -0.0023, "num_tokens": 11754466.0, "reward": 0.5143583516279856, "reward_std": 0.25600699583689374, "rewards/reward_fn/mean": 0.5143583516279856, "rewards/reward_fn/std": 0.25600700080394745, "sampling/importance_sampling_ratio/max": 0.8139317234357198, "sampling/importance_sampling_ratio/mean": 0.12806004906694093, "sampling/importance_sampling_ratio/min": 0.00012672227228449628, "sampling/sampling_logp_difference/max": 4.396458625793457, "sampling/sampling_logp_difference/mean": 0.0043079000897705555, "step": 430, "step_time": 10.431806867942214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1434.0, "completions/mean_length": 1946.609375, "completions/mean_terminated_length": 525.477294921875, "completions/min_length": 133.5, "completions/min_terminated_length": 133.5, "entropy": 0.020027481392025946, "epoch": 0.052884615384615384, "frac_reward_zero_std": 0.0, "grad_norm": 0.005735191982239485, "learning_rate": 9.472355769230769e-07, "loss": 0.0033, "num_tokens": 11946841.0, "reward": 0.505246102809906, "reward_std": 0.31909091770648956, "rewards/reward_fn/mean": 0.505246102809906, "rewards/reward_fn/std": 0.31909091770648956, "sampling/importance_sampling_ratio/max": 1.0731392800807953, "sampling/importance_sampling_ratio/mean": 0.23387114703655243, "sampling/importance_sampling_ratio/min": 8.600991895946208e-05, "sampling/sampling_logp_difference/max": 4.879274368286133, "sampling/sampling_logp_difference/mean": 0.004000650369562209, "step": 440, "step_time": 7.2324357909150425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 3000.0, "completions/max_terminated_length": 1584.0, "completions/mean_length": 1712.1666666666667, "completions/mean_terminated_length": 454.37574259440106, "completions/min_length": 122.33333333333333, "completions/min_terminated_length": 122.33333333333333, "entropy": 0.017116258572787046, "epoch": 0.054086538461538464, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.010815051384270191, "learning_rate": 9.460336538461539e-07, "loss": -0.0045, "num_tokens": 12229473.0, "reward": 0.5329016546408335, "reward_std": 0.29515260457992554, "rewards/reward_fn/mean": 0.5329016546408335, "rewards/reward_fn/std": 0.29515259464581806, "sampling/importance_sampling_ratio/max": 1.4598920345306396, "sampling/importance_sampling_ratio/mean": 0.3515676458676656, "sampling/importance_sampling_ratio/min": 6.264027767125906e-05, "sampling/sampling_logp_difference/max": 2.463268995285034, "sampling/sampling_logp_difference/mean": 0.003314345687006911, "step": 450, "step_time": 10.54611083548516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1190.0, "completions/mean_length": 2167.125, "completions/mean_terminated_length": 561.1666870117188, "completions/min_length": 142.5, "completions/min_terminated_length": 142.5, "entropy": 0.015575892757624388, "epoch": 0.055288461538461536, "frac_reward_zero_std": 0.125, "grad_norm": 0.001656687818467617, "learning_rate": 9.448317307692307e-07, "loss": -0.0006, "num_tokens": 12451577.0, "reward": 0.44146084785461426, "reward_std": 0.26547394692897797, "rewards/reward_fn/mean": 0.44146084785461426, "rewards/reward_fn/std": 0.26547394692897797, "sampling/importance_sampling_ratio/max": 1.0446627140045166, "sampling/importance_sampling_ratio/mean": 0.24866275489330292, "sampling/importance_sampling_ratio/min": 0.0001046641991706565, "sampling/sampling_logp_difference/max": 2.7838305234909058, "sampling/sampling_logp_difference/mean": 0.003334745648317039, "step": 460, "step_time": 7.24424380660057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2222.0, "completions/mean_length": 1852.0729166666667, "completions/mean_terminated_length": 429.7586975097656, "completions/min_length": 135.66666666666666, "completions/min_terminated_length": 135.66666666666666, "entropy": 0.017807922326028347, "epoch": 0.056490384615384616, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.0010989411966875196, "learning_rate": 9.436298076923076e-07, "loss": -0.0004, "num_tokens": 12732856.0, "reward": 0.4656115670998891, "reward_std": 0.3275886078675588, "rewards/reward_fn/mean": 0.4656115670998891, "rewards/reward_fn/std": 0.3275886078675588, "sampling/importance_sampling_ratio/max": 1.6520040035247803, "sampling/importance_sampling_ratio/mean": 0.3156833698352178, "sampling/importance_sampling_ratio/min": 1.8532690622426947e-05, "sampling/sampling_logp_difference/max": 14.46607263882955, "sampling/sampling_logp_difference/mean": 0.003909654139230649, "step": 470, "step_time": 10.53606793107465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2555.5, "completions/mean_length": 1811.515625, "completions/mean_terminated_length": 664.6690673828125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.02089017890393734, "epoch": 0.057692307692307696, "frac_reward_zero_std": 0.0, "grad_norm": 0.0047796317376196384, "learning_rate": 9.424278846153846e-07, "loss": -0.0019, "num_tokens": 12924457.0, "reward": 0.531330406665802, "reward_std": 0.3211039751768112, "rewards/reward_fn/mean": 0.531330406665802, "rewards/reward_fn/std": 0.3211039900779724, "sampling/importance_sampling_ratio/max": 0.896709531545639, "sampling/importance_sampling_ratio/mean": 0.25416022539138794, "sampling/importance_sampling_ratio/min": 0.00012278702433832223, "sampling/sampling_logp_difference/max": 2.093716025352478, "sampling/sampling_logp_difference/mean": 0.0038045194232836366, "step": 480, "step_time": 7.287801812589168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5833333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2215.3333333333335, "completions/mean_length": 2050.03125, "completions/mean_terminated_length": 727.5606486002604, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "entropy": 0.018715509213507175, "epoch": 0.05889423076923077, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.0007588259759359062, "learning_rate": 9.412259615384614e-07, "loss": -0.0017, "num_tokens": 13223804.0, "reward": 0.48693429430325824, "reward_std": 0.2928757468859355, "rewards/reward_fn/mean": 0.48693429430325824, "rewards/reward_fn/std": 0.2928757468859355, "sampling/importance_sampling_ratio/max": 1.058423936367035, "sampling/importance_sampling_ratio/mean": 0.20468469709157944, "sampling/importance_sampling_ratio/min": 0.00015804603692023042, "sampling/sampling_logp_difference/max": 4.971773783365886, "sampling/sampling_logp_difference/mean": 0.0034401651937514544, "step": 490, "step_time": 10.436008455231786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1574.0, "completions/mean_length": 2488.21875, "completions/mean_terminated_length": 726.2444610595703, "completions/min_length": 242.5, "completions/min_terminated_length": 242.5, "entropy": 0.018788694590330123, "epoch": 0.06009615384615385, "frac_reward_zero_std": 0.0, "grad_norm": 0.0017740951152518392, "learning_rate": 9.400240384615385e-07, "loss": -0.0033, "num_tokens": 13462746.0, "reward": 0.4546237587928772, "reward_std": 0.2579360082745552, "rewards/reward_fn/mean": 0.4546237587928772, "rewards/reward_fn/std": 0.2579360157251358, "sampling/importance_sampling_ratio/max": 1.0392012000083923, "sampling/importance_sampling_ratio/mean": 0.13464021682739258, "sampling/importance_sampling_ratio/min": 6.8751412527490174e-06, "sampling/sampling_logp_difference/max": 3.7889742851257324, "sampling/sampling_logp_difference/mean": 0.004060836159624159, "step": 500, "step_time": 7.263504793774336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5729166666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2394.6666666666665, "completions/mean_length": 2056.7708333333335, "completions/mean_terminated_length": 803.0208536783854, "completions/min_length": 165.66666666666666, "completions/min_terminated_length": 165.66666666666666, "entropy": 0.01841348400339484, "epoch": 0.06129807692307692, "frac_reward_zero_std": 0.0, "grad_norm": 0.00653255358338356, "learning_rate": 9.388221153846153e-07, "loss": -0.0006, "num_tokens": 13761580.0, "reward": 0.552949070930481, "reward_std": 0.30228544771671295, "rewards/reward_fn/mean": 0.552949070930481, "rewards/reward_fn/std": 0.30228545268376666, "sampling/importance_sampling_ratio/max": 1.077454686164856, "sampling/importance_sampling_ratio/mean": 0.18605306247870126, "sampling/importance_sampling_ratio/min": 7.194033605628647e-05, "sampling/sampling_logp_difference/max": 2.380366643269857, "sampling/sampling_logp_difference/mean": 0.0038011238599816957, "step": 510, "step_time": 10.476542139053345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1779.0, "completions/mean_length": 1972.875, "completions/mean_terminated_length": 720.8952484130859, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.020831465162336825, "epoch": 0.0625, "frac_reward_zero_std": 0.0, "grad_norm": 0.005241528153419495, "learning_rate": 9.376201923076922e-07, "loss": 0.0008, "num_tokens": 13952996.0, "reward": 0.5298287272453308, "reward_std": 0.3535000681877136, "rewards/reward_fn/mean": 0.5298287272453308, "rewards/reward_fn/std": 0.35350003838539124, "sampling/importance_sampling_ratio/max": 1.6554334163665771, "sampling/importance_sampling_ratio/mean": 0.23632808029651642, "sampling/importance_sampling_ratio/min": 0.000110775807115715, "sampling/sampling_logp_difference/max": 2.292133629322052, "sampling/sampling_logp_difference/mean": 0.004395528696477413, "step": 520, "step_time": 7.11848857384175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1715.6666666666667, "completions/mean_length": 2007.8541666666667, "completions/mean_terminated_length": 539.8067016601562, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.020731207355856894, "epoch": 0.06370192307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.007486451417207718, "learning_rate": 9.364182692307692e-07, "loss": -0.0027, "num_tokens": 14264166.0, "reward": 0.5552447239557902, "reward_std": 0.29353442788124084, "rewards/reward_fn/mean": 0.5552447239557902, "rewards/reward_fn/std": 0.29353441794713336, "sampling/importance_sampling_ratio/max": 2.0823081731796265, "sampling/importance_sampling_ratio/mean": 0.2820290724436442, "sampling/importance_sampling_ratio/min": 1.837722023386353e-05, "sampling/sampling_logp_difference/max": 1.9503083229064941, "sampling/sampling_logp_difference/mean": 0.00402392353862524, "step": 530, "step_time": 10.53378991521895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1658.0, "completions/mean_length": 2370.984375, "completions/mean_terminated_length": 833.2013244628906, "completions/min_length": 121.5, "completions/min_terminated_length": 121.5, "entropy": 0.020174438133835793, "epoch": 0.06490384615384616, "frac_reward_zero_std": 0.0, "grad_norm": 0.0030419451650232077, "learning_rate": 9.352163461538461e-07, "loss": -0.0003, "num_tokens": 14476941.0, "reward": 0.48654815554618835, "reward_std": 0.27648722380399704, "rewards/reward_fn/mean": 0.48654815554618835, "rewards/reward_fn/std": 0.27648720890283585, "sampling/importance_sampling_ratio/max": 1.3314348459243774, "sampling/importance_sampling_ratio/mean": 0.17295292392373085, "sampling/importance_sampling_ratio/min": 9.060100433547734e-06, "sampling/sampling_logp_difference/max": 11.633838653564453, "sampling/sampling_logp_difference/mean": 0.004425032529979944, "step": 540, "step_time": 7.1496186051517725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2339.3333333333335, "completions/mean_length": 2090.9270833333335, "completions/mean_terminated_length": 741.8851114908854, "completions/min_length": 163.33333333333334, "completions/min_terminated_length": 163.33333333333334, "entropy": 0.01858160048723221, "epoch": 0.06610576923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.000984502024948597, "learning_rate": 9.34014423076923e-07, "loss": -0.0026, "num_tokens": 14772246.0, "reward": 0.5447824001312256, "reward_std": 0.24193347493807474, "rewards/reward_fn/mean": 0.5447824001312256, "rewards/reward_fn/std": 0.24193347990512848, "sampling/importance_sampling_ratio/max": 1.4949373801549275, "sampling/importance_sampling_ratio/mean": 0.224141796429952, "sampling/importance_sampling_ratio/min": 0.00019897352224991968, "sampling/sampling_logp_difference/max": 2.2290508349736533, "sampling/sampling_logp_difference/mean": 0.0039052446372807026, "step": 550, "step_time": 10.367718885932117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1152.5, "completions/mean_length": 1817.046875, "completions/mean_terminated_length": 324.4868469238281, "completions/min_length": 93.5, "completions/min_terminated_length": 93.5, "entropy": 0.016833850368857383, "epoch": 0.0673076923076923, "frac_reward_zero_std": 0.125, "grad_norm": 0.0014541674172505736, "learning_rate": 9.328125e-07, "loss": -0.0001, "num_tokens": 14964425.0, "reward": 0.41559073328971863, "reward_std": 0.2830698639154434, "rewards/reward_fn/mean": 0.41559073328971863, "rewards/reward_fn/std": 0.2830698639154434, "sampling/importance_sampling_ratio/max": 1.0776475965976715, "sampling/importance_sampling_ratio/mean": 0.3436201214790344, "sampling/importance_sampling_ratio/min": 2.154937203624474e-06, "sampling/sampling_logp_difference/max": 8.799595952033997, "sampling/sampling_logp_difference/mean": 0.003420948749408126, "step": 560, "step_time": 7.040915575902909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4791666666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1044.0, "completions/mean_length": 1632.28125, "completions/mean_terminated_length": 380.5096740722656, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.017076365742832424, "epoch": 0.06850961538461539, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.0016128809656947851, "learning_rate": 9.316105769230769e-07, "loss": 0.002, "num_tokens": 15244356.0, "reward": 0.5436364610989889, "reward_std": 0.33340315024058026, "rewards/reward_fn/mean": 0.5436364610989889, "rewards/reward_fn/std": 0.3334031403064728, "sampling/importance_sampling_ratio/max": 1.7526110410690308, "sampling/importance_sampling_ratio/mean": 0.39272446433703107, "sampling/importance_sampling_ratio/min": 0.0002856162460602718, "sampling/sampling_logp_difference/max": 2.9282776514689126, "sampling/sampling_logp_difference/mean": 0.0034368912844608226, "step": 570, "step_time": 10.337140784040093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2370.0, "completions/mean_length": 2122.765625, "completions/mean_terminated_length": 793.7954864501953, "completions/min_length": 154.5, "completions/min_terminated_length": 154.5, "entropy": 0.020896189473569395, "epoch": 0.06971153846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.004186786245554686, "learning_rate": 9.304086538461539e-07, "loss": 0.0032, "num_tokens": 15466029.0, "reward": 0.5507475733757019, "reward_std": 0.29975010454654694, "rewards/reward_fn/mean": 0.5507475733757019, "rewards/reward_fn/std": 0.29975010454654694, "sampling/importance_sampling_ratio/max": 1.6351932287216187, "sampling/importance_sampling_ratio/mean": 0.18882319331169128, "sampling/importance_sampling_ratio/min": 2.13742925581073e-08, "sampling/sampling_logp_difference/max": 3.5755598545074463, "sampling/sampling_logp_difference/mean": 0.00399893126450479, "step": 580, "step_time": 7.327897801250219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1876.3333333333333, "completions/mean_length": 1927.6666666666667, "completions/mean_terminated_length": 548.2206522623698, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.017463269736617805, "epoch": 0.07091346153846154, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.0023495464120060205, "learning_rate": 9.292067307692307e-07, "loss": -0.0004, "num_tokens": 15777949.0, "reward": 0.5059513847033182, "reward_std": 0.30850990613301593, "rewards/reward_fn/mean": 0.5059513847033182, "rewards/reward_fn/std": 0.3085099111000697, "sampling/importance_sampling_ratio/max": 1.483763337135315, "sampling/importance_sampling_ratio/mean": 0.27704240878423053, "sampling/importance_sampling_ratio/min": 4.472380654381899e-05, "sampling/sampling_logp_difference/max": 21.502643585205078, "sampling/sampling_logp_difference/mean": 0.004212147401024898, "step": 590, "step_time": 10.553433235548436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2665.0, "completions/mean_length": 2164.140625, "completions/mean_terminated_length": 924.3169860839844, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "entropy": 0.0204102685675025, "epoch": 0.07211538461538461, "frac_reward_zero_std": 0.0, "grad_norm": 0.007603239268064499, "learning_rate": 9.280048076923076e-07, "loss": 0.0044, "num_tokens": 15998646.0, "reward": 0.5728678107261658, "reward_std": 0.30791547894477844, "rewards/reward_fn/mean": 0.5728678107261658, "rewards/reward_fn/std": 0.30791546404361725, "sampling/importance_sampling_ratio/max": 1.8188266158103943, "sampling/importance_sampling_ratio/mean": 0.21875887364149094, "sampling/importance_sampling_ratio/min": 0.00019893934586434625, "sampling/sampling_logp_difference/max": 2.8420323729515076, "sampling/sampling_logp_difference/mean": 0.003914974047802389, "step": 600, "step_time": 7.239077177736908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6666666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2734.0, "completions/mean_length": 2280.7604166666665, "completions/mean_terminated_length": 835.6885782877604, "completions/min_length": 226.33333333333334, "completions/min_terminated_length": 226.33333333333334, "entropy": 0.019553444534540176, "epoch": 0.0733173076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.0014798427000641823, "learning_rate": 9.268028846153846e-07, "loss": -0.0011, "num_tokens": 16344439.0, "reward": 0.5146355430285136, "reward_std": 0.2684357762336731, "rewards/reward_fn/mean": 0.5146355430285136, "rewards/reward_fn/std": 0.2684357762336731, "sampling/importance_sampling_ratio/max": 1.0675615668296814, "sampling/importance_sampling_ratio/mean": 0.1653403788805008, "sampling/importance_sampling_ratio/min": 5.7966924183953474e-05, "sampling/sampling_logp_difference/max": 3.6445116996765137, "sampling/sampling_logp_difference/mean": 0.0040693217888474464, "step": 610, "step_time": 10.57422553151846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 2029.234375, "completions/mean_terminated_length": 698.3434448242188, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.01919368365779519, "epoch": 0.07451923076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.011961231008172035, "learning_rate": 9.256009615384615e-07, "loss": 0.0009, "num_tokens": 16543278.0, "reward": 0.5238810777664185, "reward_std": 0.29847726225852966, "rewards/reward_fn/mean": 0.5238810777664185, "rewards/reward_fn/std": 0.29847727715969086, "sampling/importance_sampling_ratio/max": 0.9981687366962433, "sampling/importance_sampling_ratio/mean": 0.17846642434597015, "sampling/importance_sampling_ratio/min": 6.530275823024567e-05, "sampling/sampling_logp_difference/max": 2.4114463925361633, "sampling/sampling_logp_difference/mean": 0.003635531524196267, "step": 620, "step_time": 7.243306466005743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4479166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2144.6666666666665, "completions/mean_length": 1573.2291666666667, "completions/mean_terminated_length": 417.2685241699219, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.022404390946030618, "epoch": 0.07572115384615384, "frac_reward_zero_std": 0.0, "grad_norm": 0.0022971131838858128, "learning_rate": 9.243990384615384e-07, "loss": 0.0033, "num_tokens": 16799876.0, "reward": 0.6060110131899515, "reward_std": 0.29353410998980206, "rewards/reward_fn/mean": 0.6060110131899515, "rewards/reward_fn/std": 0.2935341000556946, "sampling/importance_sampling_ratio/max": 1.3760488430658977, "sampling/importance_sampling_ratio/mean": 0.3072710732618968, "sampling/importance_sampling_ratio/min": 0.00012303252394000688, "sampling/sampling_logp_difference/max": 3.489841858545939, "sampling/sampling_logp_difference/mean": 0.004301524410645167, "step": 630, "step_time": 10.424022000096738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2351.0, "completions/mean_length": 1617.796875, "completions/mean_terminated_length": 656.9107208251953, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.01985338320955634, "epoch": 0.07692307692307693, "frac_reward_zero_std": 0.125, "grad_norm": 0.00191899121273309, "learning_rate": 9.231971153846153e-07, "loss": -0.0054, "num_tokens": 16963935.0, "reward": 0.4917537271976471, "reward_std": 0.30399206280708313, "rewards/reward_fn/mean": 0.4917537271976471, "rewards/reward_fn/std": 0.30399206280708313, "sampling/importance_sampling_ratio/max": 1.0268234610557556, "sampling/importance_sampling_ratio/mean": 0.3051586076617241, "sampling/importance_sampling_ratio/min": 4.3538804675335996e-05, "sampling/sampling_logp_difference/max": 3.2685306072235107, "sampling/sampling_logp_difference/mean": 0.003917212947271764, "step": 640, "step_time": 6.97787937251851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5208333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1628.0, "completions/mean_length": 1835.2291666666667, "completions/mean_terminated_length": 579.4583333333334, "completions/min_length": 195.66666666666666, "completions/min_terminated_length": 195.66666666666666, "entropy": 0.01963441213592887, "epoch": 0.078125, "frac_reward_zero_std": 0.0, "grad_norm": 0.0071786693297326565, "learning_rate": 9.219951923076923e-07, "loss": -0.0035, "num_tokens": 17248509.0, "reward": 0.5396906137466431, "reward_std": 0.29780300458272296, "rewards/reward_fn/mean": 0.5396906137466431, "rewards/reward_fn/std": 0.29780300458272296, "sampling/importance_sampling_ratio/max": 1.2491965889930725, "sampling/importance_sampling_ratio/mean": 0.263188103834788, "sampling/importance_sampling_ratio/min": 5.144042552274186e-05, "sampling/sampling_logp_difference/max": 4.085236390431722, "sampling/sampling_logp_difference/mean": 0.003950289683416486, "step": 650, "step_time": 10.508196210861206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2687.0, "completions/mean_length": 1740.125, "completions/mean_terminated_length": 552.6764831542969, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.018791571073234082, "epoch": 0.07932692307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.002789914608001709, "learning_rate": 9.207932692307691e-07, "loss": 0.0001, "num_tokens": 17421373.0, "reward": 0.6073318719863892, "reward_std": 0.2965495139360428, "rewards/reward_fn/mean": 0.6073318719863892, "rewards/reward_fn/std": 0.2965495139360428, "sampling/importance_sampling_ratio/max": 1.256477266550064, "sampling/importance_sampling_ratio/mean": 0.3074912428855896, "sampling/importance_sampling_ratio/min": 1.079064486475545e-05, "sampling/sampling_logp_difference/max": 5.354617774486542, "sampling/sampling_logp_difference/mean": 0.0038837387692183256, "step": 660, "step_time": 7.12276476919651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6041666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2312.6666666666665, "completions/mean_length": 2116.9479166666665, "completions/mean_terminated_length": 765.8596598307291, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.019182747323065998, "epoch": 0.08052884615384616, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.006521002855151892, "learning_rate": 9.195913461538462e-07, "loss": 0.0025, "num_tokens": 17719712.0, "reward": 0.5179949601491293, "reward_std": 0.3033226529757182, "rewards/reward_fn/mean": 0.5179949601491293, "rewards/reward_fn/std": 0.3033226629098256, "sampling/importance_sampling_ratio/max": 1.1238491137822468, "sampling/importance_sampling_ratio/mean": 0.1916132022937139, "sampling/importance_sampling_ratio/min": 0.0001727942378882593, "sampling/sampling_logp_difference/max": 5.004186471303304, "sampling/sampling_logp_difference/mean": 0.003844886009270946, "step": 670, "step_time": 10.383300797268749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1319.0, "completions/mean_length": 2227.5, "completions/mean_terminated_length": 622.8026428222656, "completions/min_length": 183.5, "completions/min_terminated_length": 183.5, "entropy": 0.020831457804888487, "epoch": 0.08173076923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.0093160979449749, "learning_rate": 9.18389423076923e-07, "loss": -0.0031, "num_tokens": 17942872.0, "reward": 0.5221043974161148, "reward_std": 0.24484867602586746, "rewards/reward_fn/mean": 0.5221043974161148, "rewards/reward_fn/std": 0.24484866112470627, "sampling/importance_sampling_ratio/max": 0.846078559756279, "sampling/importance_sampling_ratio/mean": 0.13924792036414146, "sampling/importance_sampling_ratio/min": 1.5529393154523063e-05, "sampling/sampling_logp_difference/max": 3.8047399520874023, "sampling/sampling_logp_difference/mean": 0.004105557221919298, "step": 680, "step_time": 7.26061023240909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6979166666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2194.3333333333335, "completions/mean_length": 2372.6666666666665, "completions/mean_terminated_length": 905.3928833007812, "completions/min_length": 199.66666666666666, "completions/min_terminated_length": 199.66666666666666, "entropy": 0.019637163914740085, "epoch": 0.0829326923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.007457136642187834, "learning_rate": 9.171875e-07, "loss": -0.0015, "num_tokens": 18359264.0, "reward": 0.511309027671814, "reward_std": 0.2952939768632253, "rewards/reward_fn/mean": 0.511309027671814, "rewards/reward_fn/std": 0.2952939569950104, "sampling/importance_sampling_ratio/max": 1.4550379316012065, "sampling/importance_sampling_ratio/mean": 0.11285052945216496, "sampling/importance_sampling_ratio/min": 4.621858152304796e-06, "sampling/sampling_logp_difference/max": 4.833513498306274, "sampling/sampling_logp_difference/mean": 0.004586468295504649, "step": 690, "step_time": 11.65839467747137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 3000.0, "completions/max_terminated_length": 1502.5, "completions/mean_length": 1710.375, "completions/mean_terminated_length": 424.2024383544922, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.020512032974511384, "epoch": 0.08413461538461539, "frac_reward_zero_std": 0.125, "grad_norm": 0.0026555899530649185, "learning_rate": 9.159855769230769e-07, "loss": -0.0025, "num_tokens": 18533832.0, "reward": 0.4933992773294449, "reward_std": 0.32547715306282043, "rewards/reward_fn/mean": 0.4933992773294449, "rewards/reward_fn/std": 0.325477197766304, "sampling/importance_sampling_ratio/max": 1.2727237343788147, "sampling/importance_sampling_ratio/mean": 0.2944595664739609, "sampling/importance_sampling_ratio/min": 7.462767644028645e-05, "sampling/sampling_logp_difference/max": 2.287927508354187, "sampling/sampling_logp_difference/mean": 0.0037258543306961656, "step": 700, "step_time": 7.750468777492642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1463.3333333333333, "completions/mean_length": 1944.8958333333333, "completions/mean_terminated_length": 543.6111195882162, "completions/min_length": 199.66666666666666, "completions/min_terminated_length": 199.66666666666666, "entropy": 0.017059090454131363, "epoch": 0.08533653846153846, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.0018933776300400496, "learning_rate": 9.147836538461538e-07, "loss": 0.0077, "num_tokens": 18841918.0, "reward": 0.513282060623169, "reward_std": 0.2788737614949544, "rewards/reward_fn/mean": 0.513282060623169, "rewards/reward_fn/std": 0.2788737465937932, "sampling/importance_sampling_ratio/max": 0.9605980962514877, "sampling/importance_sampling_ratio/mean": 0.2933625529209773, "sampling/importance_sampling_ratio/min": 2.475707345322986e-05, "sampling/sampling_logp_difference/max": 5.894338289896647, "sampling/sampling_logp_difference/mean": 0.0036804194872577987, "step": 710, "step_time": 10.432419973239302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1699.0, "completions/mean_length": 2420.28125, "completions/mean_terminated_length": 607.8333587646484, "completions/min_length": 217.5, "completions/min_terminated_length": 217.5, "entropy": 0.018398384004831313, "epoch": 0.08653846153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.0010068644769489765, "learning_rate": 9.135817307692308e-07, "loss": -0.0028, "num_tokens": 19074440.0, "reward": 0.48434993624687195, "reward_std": 0.22759928554296494, "rewards/reward_fn/mean": 0.48434993624687195, "rewards/reward_fn/std": 0.22759928554296494, "sampling/importance_sampling_ratio/max": 0.8110776841640472, "sampling/importance_sampling_ratio/mean": 0.14336800761520863, "sampling/importance_sampling_ratio/min": 0.0004230875347275287, "sampling/sampling_logp_difference/max": 3.5498058795928955, "sampling/sampling_logp_difference/mean": 0.003728823969140649, "step": 720, "step_time": 7.2621736258268355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5416666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 1604.6666666666667, "completions/mean_length": 1843.9479166666667, "completions/mean_terminated_length": 516.1387430826823, "completions/min_length": 152.66666666666666, "completions/min_terminated_length": 152.66666666666666, "entropy": 0.018639271799474954, "epoch": 0.08774038461538461, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.0008005331619642675, "learning_rate": 9.123798076923076e-07, "loss": -0.0031, "num_tokens": 19381659.0, "reward": 0.535393605629603, "reward_std": 0.29793277382850647, "rewards/reward_fn/mean": 0.535393605629603, "rewards/reward_fn/std": 0.29793277382850647, "sampling/importance_sampling_ratio/max": 1.0816890597343445, "sampling/importance_sampling_ratio/mean": 0.2543373604615529, "sampling/importance_sampling_ratio/min": 0.00010582089938300972, "sampling/sampling_logp_difference/max": 2.620308001836141, "sampling/sampling_logp_difference/mean": 0.0038731231664617858, "step": 730, "step_time": 10.611787948291749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2223.0, "completions/mean_length": 1392.234375, "completions/mean_terminated_length": 494.17149353027344, "completions/min_length": 142.5, "completions/min_terminated_length": 142.5, "entropy": 0.02185629736632109, "epoch": 0.0889423076923077, "frac_reward_zero_std": 0.125, "grad_norm": 0.0033056505490094423, "learning_rate": 9.111778846153846e-07, "loss": -0.0043, "num_tokens": 19543114.0, "reward": 0.56279057264328, "reward_std": 0.3192734569311142, "rewards/reward_fn/mean": 0.56279057264328, "rewards/reward_fn/std": 0.3192734569311142, "sampling/importance_sampling_ratio/max": 1.8793782591819763, "sampling/importance_sampling_ratio/mean": 0.34550438821315765, "sampling/importance_sampling_ratio/min": 6.048540490155574e-05, "sampling/sampling_logp_difference/max": 2.9145283699035645, "sampling/sampling_logp_difference/mean": 0.004359193379059434, "step": 740, "step_time": 7.044930969830602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1573.6666666666667, "completions/mean_length": 1966.71875, "completions/mean_terminated_length": 680.7994893391927, "completions/min_length": 234.66666666666666, "completions/min_terminated_length": 234.66666666666666, "entropy": 0.021756542287766932, "epoch": 0.09014423076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.003088445635512471, "learning_rate": 9.099759615384615e-07, "loss": 0.0002, "num_tokens": 19848207.0, "reward": 0.5814730326334635, "reward_std": 0.2850917677084605, "rewards/reward_fn/mean": 0.5814730326334635, "rewards/reward_fn/std": 0.28509177764256793, "sampling/importance_sampling_ratio/max": 1.1962996025880177, "sampling/importance_sampling_ratio/mean": 0.20166579882303873, "sampling/importance_sampling_ratio/min": 6.281016279293301e-05, "sampling/sampling_logp_difference/max": 2.718536615371704, "sampling/sampling_logp_difference/mean": 0.00431708541388313, "step": 750, "step_time": 10.556610192079097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1173.5, "completions/mean_length": 2133.796875, "completions/mean_terminated_length": 599.4015502929688, "completions/min_length": 263.5, "completions/min_terminated_length": 263.5, "entropy": 0.018736871145665645, "epoch": 0.09134615384615384, "frac_reward_zero_std": 0.0, "grad_norm": 0.005688189063221216, "learning_rate": 9.087740384615384e-07, "loss": 0.0014, "num_tokens": 20060042.0, "reward": 0.47630974650382996, "reward_std": 0.28866395354270935, "rewards/reward_fn/mean": 0.47630974650382996, "rewards/reward_fn/std": 0.28866396844387054, "sampling/importance_sampling_ratio/max": 1.041463851928711, "sampling/importance_sampling_ratio/mean": 0.1979226991534233, "sampling/importance_sampling_ratio/min": 7.873897089893944e-07, "sampling/sampling_logp_difference/max": 6.599092245101929, "sampling/sampling_logp_difference/mean": 0.0038496492197737098, "step": 760, "step_time": 7.219053522869944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4895833333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1494.0, "completions/mean_length": 1684.875, "completions/mean_terminated_length": 455.02381388346356, "completions/min_length": 177.33333333333334, "completions/min_terminated_length": 177.33333333333334, "entropy": 0.02246035672724247, "epoch": 0.09254807692307693, "frac_reward_zero_std": 0.0, "grad_norm": 0.005384939257055521, "learning_rate": 9.075721153846153e-07, "loss": -0.0009, "num_tokens": 20341630.0, "reward": 0.6248141328493754, "reward_std": 0.27725750207901, "rewards/reward_fn/mean": 0.6248141328493754, "rewards/reward_fn/std": 0.27725750207901, "sampling/importance_sampling_ratio/max": 1.5687095920244853, "sampling/importance_sampling_ratio/mean": 0.30778169135252637, "sampling/importance_sampling_ratio/min": 7.956366001356703e-05, "sampling/sampling_logp_difference/max": 3.8584539890289307, "sampling/sampling_logp_difference/mean": 0.004646194788316886, "step": 770, "step_time": 10.534044423606247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2228.5, "completions/mean_length": 1987.984375, "completions/mean_terminated_length": 579.5411987304688, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.019817699678242207, "epoch": 0.09375, "frac_reward_zero_std": 0.0, "grad_norm": 0.013464679010212421, "learning_rate": 9.063701923076923e-07, "loss": 0.0009, "num_tokens": 20542949.0, "reward": 0.5833176672458649, "reward_std": 0.28237713873386383, "rewards/reward_fn/mean": 0.5833176672458649, "rewards/reward_fn/std": 0.28237710893154144, "sampling/importance_sampling_ratio/max": 1.9805582761764526, "sampling/importance_sampling_ratio/mean": 0.2596963196992874, "sampling/importance_sampling_ratio/min": 0.00010447373460920062, "sampling/sampling_logp_difference/max": 3.7474499940872192, "sampling/sampling_logp_difference/mean": 0.003997455700300634, "step": 780, "step_time": 7.1740713095292445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5520833333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1736.6666666666667, "completions/mean_length": 1964.0520833333333, "completions/mean_terminated_length": 707.9602864583334, "completions/min_length": 132.66666666666666, "completions/min_terminated_length": 132.66666666666666, "entropy": 0.02114916518330574, "epoch": 0.09495192307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.004186716396361589, "learning_rate": 9.051682692307692e-07, "loss": 0.0034, "num_tokens": 20839962.0, "reward": 0.594942569732666, "reward_std": 0.2703310251235962, "rewards/reward_fn/mean": 0.594942569732666, "rewards/reward_fn/std": 0.2703310151894887, "sampling/importance_sampling_ratio/max": 1.4414838949839275, "sampling/importance_sampling_ratio/mean": 0.20196766157944998, "sampling/importance_sampling_ratio/min": 2.041707800041574e-05, "sampling/sampling_logp_difference/max": 3.6781795819600425, "sampling/sampling_logp_difference/mean": 0.004118229650581877, "step": 790, "step_time": 10.483766446262598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1924.0, "completions/mean_length": 1731.171875, "completions/mean_terminated_length": 533.9174194335938, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.02057642173022032, "epoch": 0.09615384615384616, "frac_reward_zero_std": 0.0, "grad_norm": 0.003731030272319913, "learning_rate": 9.039663461538461e-07, "loss": -0.0067, "num_tokens": 21017781.0, "reward": 0.6239477097988129, "reward_std": 0.2798934578895569, "rewards/reward_fn/mean": 0.6239477097988129, "rewards/reward_fn/std": 0.2798934429883957, "sampling/importance_sampling_ratio/max": 0.9567515254020691, "sampling/importance_sampling_ratio/mean": 0.23725122213363647, "sampling/importance_sampling_ratio/min": 0.00020800563288503326, "sampling/sampling_logp_difference/max": 3.639017879962921, "sampling/sampling_logp_difference/mean": 0.004453250905498862, "step": 800, "step_time": 7.187580725364387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2038.6666666666667, "completions/mean_length": 1931.6875, "completions/mean_terminated_length": 724.3693237304688, "completions/min_length": 186.33333333333334, "completions/min_terminated_length": 186.33333333333334, "entropy": 0.02057900931686163, "epoch": 0.09735576923076923, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.005419518332928419, "learning_rate": 9.027644230769231e-07, "loss": 0.001, "num_tokens": 21310031.0, "reward": 0.5740555723508199, "reward_std": 0.2590402414401372, "rewards/reward_fn/mean": 0.5740555723508199, "rewards/reward_fn/std": 0.2590402215719223, "sampling/importance_sampling_ratio/max": 1.1908231973648071, "sampling/importance_sampling_ratio/mean": 0.2821030480166276, "sampling/importance_sampling_ratio/min": 6.0931165544767886e-05, "sampling/sampling_logp_difference/max": 3.7055761019388833, "sampling/sampling_logp_difference/mean": 0.0042450168790916605, "step": 810, "step_time": 10.483284460194408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2231.5, "completions/mean_length": 2071.1875, "completions/mean_terminated_length": 845.3588256835938, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.01782824080437422, "epoch": 0.0985576923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.027781786397099495, "learning_rate": 9.015625e-07, "loss": 0.0096, "num_tokens": 21506403.0, "reward": 0.5625352263450623, "reward_std": 0.3009888380765915, "rewards/reward_fn/mean": 0.5625352263450623, "rewards/reward_fn/std": 0.3009888529777527, "sampling/importance_sampling_ratio/max": 1.2007531821727753, "sampling/importance_sampling_ratio/mean": 0.197459414601326, "sampling/importance_sampling_ratio/min": 6.915833182574715e-05, "sampling/sampling_logp_difference/max": 2.8208706378936768, "sampling/sampling_logp_difference/mean": 0.003923515789210796, "step": 820, "step_time": 7.1390301316976545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2007.3333333333333, "completions/mean_length": 1794.9270833333333, "completions/mean_terminated_length": 508.2215627034505, "completions/min_length": 117.33333333333333, "completions/min_terminated_length": 117.33333333333333, "entropy": 0.019712616968899966, "epoch": 0.09975961538461539, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.021940914914011955, "learning_rate": 9.003605769230768e-07, "loss": -0.0013, "num_tokens": 21795140.0, "reward": 0.5027588407198588, "reward_std": 0.3300045132637024, "rewards/reward_fn/mean": 0.5027588407198588, "rewards/reward_fn/std": 0.3300044933954875, "sampling/importance_sampling_ratio/max": 1.121954321861267, "sampling/importance_sampling_ratio/mean": 0.3059955835342407, "sampling/importance_sampling_ratio/min": 3.9832278349175944e-05, "sampling/sampling_logp_difference/max": 5.118773380915324, "sampling/sampling_logp_difference/mean": 0.003935306255395214, "step": 830, "step_time": 10.409306166972964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2222.5, "completions/mean_length": 1818.0625, "completions/mean_terminated_length": 842.9424438476562, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.02087469520047307, "epoch": 0.10096153846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.004141318611800671, "learning_rate": 8.991586538461539e-07, "loss": 0.0015, "num_tokens": 21972712.0, "reward": 0.5787269175052643, "reward_std": 0.3007924258708954, "rewards/reward_fn/mean": 0.5787269175052643, "rewards/reward_fn/std": 0.3007924258708954, "sampling/importance_sampling_ratio/max": 1.071198731660843, "sampling/importance_sampling_ratio/mean": 0.2060057893395424, "sampling/importance_sampling_ratio/min": 1.863632178356056e-06, "sampling/sampling_logp_difference/max": 1.915416419506073, "sampling/sampling_logp_difference/mean": 0.004536935361102223, "step": 840, "step_time": 7.1170178558677435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1839.6666666666667, "completions/mean_length": 1973.6875, "completions/mean_terminated_length": 690.3055826822916, "completions/min_length": 174.66666666666666, "completions/min_terminated_length": 174.66666666666666, "entropy": 0.019836829602718355, "epoch": 0.10216346153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.006693142466247082, "learning_rate": 8.979567307692307e-07, "loss": -0.0028, "num_tokens": 22273082.0, "reward": 0.6106228033701578, "reward_std": 0.26888884603977203, "rewards/reward_fn/mean": 0.6106228033701578, "rewards/reward_fn/std": 0.26888883610566455, "sampling/importance_sampling_ratio/max": 1.0655073126157124, "sampling/importance_sampling_ratio/mean": 0.22743728756904602, "sampling/importance_sampling_ratio/min": 9.335880410314228e-05, "sampling/sampling_logp_difference/max": 4.5512746175130205, "sampling/sampling_logp_difference/mean": 0.004258006655921538, "step": 850, "step_time": 10.490174485556782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2272.0, "completions/mean_length": 1981.21875, "completions/mean_terminated_length": 863.7083435058594, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.019394595362246036, "epoch": 0.10336538461538461, "frac_reward_zero_std": 0.0, "grad_norm": 0.002174183027818799, "learning_rate": 8.967548076923076e-07, "loss": -0.002, "num_tokens": 22469920.0, "reward": 0.6039811670780182, "reward_std": 0.29271097481250763, "rewards/reward_fn/mean": 0.6039811670780182, "rewards/reward_fn/std": 0.29271095991134644, "sampling/importance_sampling_ratio/max": 1.2017822861671448, "sampling/importance_sampling_ratio/mean": 0.2075396291911602, "sampling/importance_sampling_ratio/min": 0.00012080024680471979, "sampling/sampling_logp_difference/max": 4.129118204116821, "sampling/sampling_logp_difference/mean": 0.003932291641831398, "step": 860, "step_time": 7.275878791138529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4479166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 1619.8958333333333, "completions/mean_terminated_length": 530.1700439453125, "completions/min_length": 177.33333333333334, "completions/min_terminated_length": 177.33333333333334, "entropy": 0.02140417378395796, "epoch": 0.1045673076923077, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.0022497575264424086, "learning_rate": 8.955528846153846e-07, "loss": -0.0072, "num_tokens": 22736494.0, "reward": 0.5606730381647745, "reward_std": 0.3224658668041229, "rewards/reward_fn/mean": 0.5606730381647745, "rewards/reward_fn/std": 0.3224658767382304, "sampling/importance_sampling_ratio/max": 1.2161237796147664, "sampling/importance_sampling_ratio/mean": 0.3330496648947398, "sampling/importance_sampling_ratio/min": 2.7214856932763116e-05, "sampling/sampling_logp_difference/max": 4.253745953241984, "sampling/sampling_logp_difference/mean": 0.004306529648602009, "step": 870, "step_time": 10.4833337479271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2562.5, "completions/mean_length": 1653.890625, "completions/mean_terminated_length": 629.5749969482422, "completions/min_length": 105.5, "completions/min_terminated_length": 105.5, "entropy": 0.018865276128053665, "epoch": 0.10576923076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.0063938237726688385, "learning_rate": 8.943509615384614e-07, "loss": 0.0005, "num_tokens": 22934103.0, "reward": 0.6075493097305298, "reward_std": 0.31934112310409546, "rewards/reward_fn/mean": 0.6075493097305298, "rewards/reward_fn/std": 0.31934112310409546, "sampling/importance_sampling_ratio/max": 1.3710907697677612, "sampling/importance_sampling_ratio/mean": 0.33152295649051666, "sampling/importance_sampling_ratio/min": 0.00011138573427160736, "sampling/sampling_logp_difference/max": 3.7473549246788025, "sampling/sampling_logp_difference/mean": 0.003580800723284483, "step": 880, "step_time": 7.294528009742498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1696.0, "completions/mean_length": 1354.6041666666667, "completions/mean_terminated_length": 393.9265848795573, "completions/min_length": 177.66666666666666, "completions/min_terminated_length": 177.66666666666666, "entropy": 0.018612127844244243, "epoch": 0.10697115384615384, "frac_reward_zero_std": 0.25, "grad_norm": 0.004806436598300934, "learning_rate": 8.931490384615385e-07, "loss": -0.0079, "num_tokens": 23163385.0, "reward": 0.601132353146871, "reward_std": 0.3252565066019694, "rewards/reward_fn/mean": 0.601132353146871, "rewards/reward_fn/std": 0.3252565066019694, "sampling/importance_sampling_ratio/max": 0.9897635181744894, "sampling/importance_sampling_ratio/mean": 0.37257973353068036, "sampling/importance_sampling_ratio/min": 7.538027754587044e-05, "sampling/sampling_logp_difference/max": 3.1822074254353843, "sampling/sampling_logp_difference/mean": 0.003966478941341241, "step": 890, "step_time": 10.326582074537873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1479.5, "completions/mean_length": 1896.859375, "completions/mean_terminated_length": 373.93824768066406, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.019445508159697054, "epoch": 0.10817307692307693, "frac_reward_zero_std": 0.0, "grad_norm": 0.005159351509064436, "learning_rate": 8.919471153846153e-07, "loss": 0.0003, "num_tokens": 23356712.0, "reward": 0.5994369685649872, "reward_std": 0.2813599854707718, "rewards/reward_fn/mean": 0.5994369685649872, "rewards/reward_fn/std": 0.2813599929213524, "sampling/importance_sampling_ratio/max": 1.1603797376155853, "sampling/importance_sampling_ratio/mean": 0.27624987810850143, "sampling/importance_sampling_ratio/min": 7.359326446021441e-06, "sampling/sampling_logp_difference/max": 3.2975656986236572, "sampling/sampling_logp_difference/mean": 0.004031449439935386, "step": 900, "step_time": 7.1527102465741335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4583333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1307.3333333333333, "completions/mean_length": 1544.3958333333333, "completions/mean_terminated_length": 324.3948262532552, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.01705426648259163, "epoch": 0.109375, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.014491376467049122, "learning_rate": 8.907451923076923e-07, "loss": 0.0008, "num_tokens": 23597574.0, "reward": 0.6258066892623901, "reward_std": 0.2883763561646144, "rewards/reward_fn/mean": 0.6258066892623901, "rewards/reward_fn/std": 0.2883763611316681, "sampling/importance_sampling_ratio/max": 1.3109796444574993, "sampling/importance_sampling_ratio/mean": 0.39834457635879517, "sampling/importance_sampling_ratio/min": 0.0012040818553297565, "sampling/sampling_logp_difference/max": 3.0049490928649902, "sampling/sampling_logp_difference/mean": 0.0035605190011362233, "step": 910, "step_time": 10.223639958072454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1901.5, "completions/mean_length": 1826.875, "completions/mean_terminated_length": 579.7129516601562, "completions/min_length": 153.5, "completions/min_terminated_length": 153.5, "entropy": 0.01792654376477003, "epoch": 0.11057692307692307, "frac_reward_zero_std": 0.125, "grad_norm": 0.0007909393752925098, "learning_rate": 8.895432692307692e-07, "loss": -0.0062, "num_tokens": 23781766.0, "reward": 0.3946477174758911, "reward_std": 0.33406612277030945, "rewards/reward_fn/mean": 0.3946477174758911, "rewards/reward_fn/std": 0.33406612277030945, "sampling/importance_sampling_ratio/max": 1.051205039024353, "sampling/importance_sampling_ratio/mean": 0.28133077174425125, "sampling/importance_sampling_ratio/min": 4.389360583445523e-05, "sampling/sampling_logp_difference/max": 3.6680057048797607, "sampling/sampling_logp_difference/mean": 0.00368997675832361, "step": 920, "step_time": 7.14753420073539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5104166666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 1877.0, "completions/mean_length": 1807.2083333333333, "completions/mean_terminated_length": 554.2799072265625, "completions/min_length": 165.33333333333334, "completions/min_terminated_length": 165.33333333333334, "entropy": 0.02100477274507284, "epoch": 0.11177884615384616, "frac_reward_zero_std": 0.0, "grad_norm": 0.0013811811804771423, "learning_rate": 8.883413461538462e-07, "loss": 0.0006, "num_tokens": 24070554.0, "reward": 0.618011474609375, "reward_std": 0.29499903321266174, "rewards/reward_fn/mean": 0.618011474609375, "rewards/reward_fn/std": 0.29499903321266174, "sampling/importance_sampling_ratio/max": 1.5378629366556804, "sampling/importance_sampling_ratio/mean": 0.24361814558506012, "sampling/importance_sampling_ratio/min": 0.0001284922397341385, "sampling/sampling_logp_difference/max": 2.8808202346165976, "sampling/sampling_logp_difference/mean": 0.00437823651979367, "step": 930, "step_time": 10.461027862969786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2811.5, "completions/mean_length": 1795.171875, "completions/mean_terminated_length": 592.7934875488281, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.02003017421811819, "epoch": 0.11298076923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.007183162495493889, "learning_rate": 8.87139423076923e-07, "loss": -0.0032, "num_tokens": 24276717.0, "reward": 0.5286199450492859, "reward_std": 0.30620574951171875, "rewards/reward_fn/mean": 0.5286199450492859, "rewards/reward_fn/std": 0.30620574951171875, "sampling/importance_sampling_ratio/max": 1.0594962239265442, "sampling/importance_sampling_ratio/mean": 0.30247754603624344, "sampling/importance_sampling_ratio/min": 8.37530718627022e-05, "sampling/sampling_logp_difference/max": 4.346694231033325, "sampling/sampling_logp_difference/mean": 0.0037703944835811853, "step": 940, "step_time": 7.301756392512471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5520833333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1578.0, "completions/mean_length": 1862.65625, "completions/mean_terminated_length": 458.3776041666667, "completions/min_length": 157.66666666666666, "completions/min_terminated_length": 157.66666666666666, "entropy": 0.02119620405137539, "epoch": 0.1141826923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.00728571368381381, "learning_rate": 8.859375e-07, "loss": 0.0001, "num_tokens": 24577964.0, "reward": 0.6054138938585917, "reward_std": 0.27796850601832074, "rewards/reward_fn/mean": 0.6054138938585917, "rewards/reward_fn/std": 0.27796849608421326, "sampling/importance_sampling_ratio/max": 1.482273022333781, "sampling/importance_sampling_ratio/mean": 0.2682342082262039, "sampling/importance_sampling_ratio/min": 0.00012676521674848118, "sampling/sampling_logp_difference/max": 5.331699411074321, "sampling/sampling_logp_difference/mean": 0.0040924435791869955, "step": 950, "step_time": 10.590890022553504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2376.0, "completions/mean_length": 1936.15625, "completions/mean_terminated_length": 750.9910888671875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.01993333427235484, "epoch": 0.11538461538461539, "frac_reward_zero_std": 0.0, "grad_norm": 0.007642277982085943, "learning_rate": 8.847355769230769e-07, "loss": -0.007, "num_tokens": 24777742.0, "reward": 0.6039822101593018, "reward_std": 0.29520365595817566, "rewards/reward_fn/mean": 0.6039822101593018, "rewards/reward_fn/std": 0.29520364105701447, "sampling/importance_sampling_ratio/max": 1.2725469172000885, "sampling/importance_sampling_ratio/mean": 0.23597021400928497, "sampling/importance_sampling_ratio/min": 8.415197589783929e-05, "sampling/sampling_logp_difference/max": 2.128592848777771, "sampling/sampling_logp_difference/mean": 0.004328221315518022, "step": 960, "step_time": 7.231234591826796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 3000.0, "completions/max_terminated_length": 2444.3333333333335, "completions/mean_length": 1820.59375, "completions/mean_terminated_length": 675.6708374023438, "completions/min_length": 154.66666666666666, "completions/min_terminated_length": 154.66666666666666, "entropy": 0.020807130821049215, "epoch": 0.11658653846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.00907782930880785, "learning_rate": 8.835336538461538e-07, "loss": -0.0013, "num_tokens": 25055551.0, "reward": 0.6422503391901652, "reward_std": 0.26396941641966504, "rewards/reward_fn/mean": 0.6422503391901652, "rewards/reward_fn/std": 0.26396942138671875, "sampling/importance_sampling_ratio/max": 1.13424551486969, "sampling/importance_sampling_ratio/mean": 0.2064783920844396, "sampling/importance_sampling_ratio/min": 0.00012729928554714812, "sampling/sampling_logp_difference/max": 3.243707259496053, "sampling/sampling_logp_difference/mean": 0.004386583498368661, "step": 970, "step_time": 10.501741997804492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1026.0, "completions/mean_length": 1711.546875, "completions/mean_terminated_length": 340.6491394042969, "completions/min_length": 139.5, "completions/min_terminated_length": 139.5, "entropy": 0.02118786480277777, "epoch": 0.11778846153846154, "frac_reward_zero_std": 0.125, "grad_norm": 0.008956641890108585, "learning_rate": 8.823317307692308e-07, "loss": 0.0005, "num_tokens": 25239450.0, "reward": 0.5236629545688629, "reward_std": 0.29252470284700394, "rewards/reward_fn/mean": 0.5236629545688629, "rewards/reward_fn/std": 0.29252468794584274, "sampling/importance_sampling_ratio/max": 1.2386882305145264, "sampling/importance_sampling_ratio/mean": 0.31699468195438385, "sampling/importance_sampling_ratio/min": 0.00014095268488745205, "sampling/sampling_logp_difference/max": 2.361099898815155, "sampling/sampling_logp_difference/mean": 0.004224814008921385, "step": 980, "step_time": 7.13518309108913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1803.0, "completions/mean_length": 1436.3645833333333, "completions/mean_terminated_length": 365.7068684895833, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.018403230328112842, "epoch": 0.11899038461538461, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.013908635824918747, "learning_rate": 8.811298076923076e-07, "loss": 0.0017, "num_tokens": 25474749.0, "reward": 0.6092097163200378, "reward_std": 0.3098750760157903, "rewards/reward_fn/mean": 0.6092097163200378, "rewards/reward_fn/std": 0.3098750760157903, "sampling/importance_sampling_ratio/max": 1.295571466286977, "sampling/importance_sampling_ratio/mean": 0.39303799470265705, "sampling/importance_sampling_ratio/min": 0.000429751545501252, "sampling/sampling_logp_difference/max": 2.0793912410736084, "sampling/sampling_logp_difference/mean": 0.0036352280682573714, "step": 990, "step_time": 10.32327980697155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2269.0, "completions/mean_length": 1731.296875, "completions/mean_terminated_length": 539.1580810546875, "completions/min_length": 139.5, "completions/min_terminated_length": 139.5, "entropy": 0.018927588779479265, "epoch": 0.1201923076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.0024302839301526546, "learning_rate": 8.799278846153845e-07, "loss": -0.0019, "num_tokens": 25666896.0, "reward": 0.6231748461723328, "reward_std": 0.2999221384525299, "rewards/reward_fn/mean": 0.6231748461723328, "rewards/reward_fn/std": 0.2999221235513687, "sampling/importance_sampling_ratio/max": 0.8811006546020508, "sampling/importance_sampling_ratio/mean": 0.26880720257759094, "sampling/importance_sampling_ratio/min": 0.00011944907782890368, "sampling/sampling_logp_difference/max": 3.2188055515289307, "sampling/sampling_logp_difference/mean": 0.004004790214821696, "step": 1000, "step_time": 6.956439364515245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5208333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2342.3333333333335, "completions/mean_length": 1824.7916666666667, "completions/mean_terminated_length": 652.3026835123698, "completions/min_length": 157.66666666666666, "completions/min_terminated_length": 157.66666666666666, "entropy": 0.02277037426829338, "epoch": 0.12139423076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.009981714189052582, "learning_rate": 8.787259615384615e-07, "loss": -0.0041, "num_tokens": 25952092.0, "reward": 0.5703679223855337, "reward_std": 0.27451325456301373, "rewards/reward_fn/mean": 0.5703679223855337, "rewards/reward_fn/std": 0.27451325456301373, "sampling/importance_sampling_ratio/max": 1.273661454518636, "sampling/importance_sampling_ratio/mean": 0.3152107000350952, "sampling/importance_sampling_ratio/min": 8.359627887936465e-05, "sampling/sampling_logp_difference/max": 3.5377695560455322, "sampling/sampling_logp_difference/mean": 0.0048722306576867895, "step": 1010, "step_time": 10.402503939252346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 3000.0, "completions/max_terminated_length": 2229.0, "completions/mean_length": 1756.40625, "completions/mean_terminated_length": 512.8125, "completions/min_length": 119.5, "completions/min_terminated_length": 119.5, "entropy": 0.020954927057027818, "epoch": 0.12259615384615384, "frac_reward_zero_std": 0.125, "grad_norm": 0.008846716023981571, "learning_rate": 8.775240384615384e-07, "loss": 0.0244, "num_tokens": 26135638.0, "reward": 0.6416691839694977, "reward_std": 0.2986660450696945, "rewards/reward_fn/mean": 0.6416691839694977, "rewards/reward_fn/std": 0.29866601526737213, "sampling/importance_sampling_ratio/max": 1.5884857773780823, "sampling/importance_sampling_ratio/mean": 0.32979606091976166, "sampling/importance_sampling_ratio/min": 7.364156772382557e-06, "sampling/sampling_logp_difference/max": 3.213270664215088, "sampling/sampling_logp_difference/mean": 0.004262668779119849, "step": 1020, "step_time": 7.15741326790303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1609.3333333333333, "completions/mean_length": 1364.3645833333333, "completions/mean_terminated_length": 507.0949198404948, "completions/min_length": 103.33333333333333, "completions/min_terminated_length": 103.33333333333333, "entropy": 0.01822480754926801, "epoch": 0.12379807692307693, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.005991717800498009, "learning_rate": 8.763221153846153e-07, "loss": 0.0004, "num_tokens": 26351761.0, "reward": 0.5535477797190348, "reward_std": 0.3203381697336833, "rewards/reward_fn/mean": 0.5535477797190348, "rewards/reward_fn/std": 0.3203381597995758, "sampling/importance_sampling_ratio/max": 1.2864855925242107, "sampling/importance_sampling_ratio/mean": 0.405123511950175, "sampling/importance_sampling_ratio/min": 0.0001832903605342532, "sampling/sampling_logp_difference/max": 3.83233904838562, "sampling/sampling_logp_difference/mean": 0.003853829111903906, "step": 1030, "step_time": 10.470237208995968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1408.5, "completions/mean_length": 1886.984375, "completions/mean_terminated_length": 489.0611267089844, "completions/min_length": 163.5, "completions/min_terminated_length": 163.5, "entropy": 0.02109644953161478, "epoch": 0.125, "frac_reward_zero_std": 0.0, "grad_norm": 0.0019366160267964005, "learning_rate": 8.751201923076923e-07, "loss": 0.0016, "num_tokens": 26541104.0, "reward": 0.5704314112663269, "reward_std": 0.26700153201818466, "rewards/reward_fn/mean": 0.5704314112663269, "rewards/reward_fn/std": 0.26700153201818466, "sampling/importance_sampling_ratio/max": 0.9312771558761597, "sampling/importance_sampling_ratio/mean": 0.21614965051412582, "sampling/importance_sampling_ratio/min": 0.00028044644932379015, "sampling/sampling_logp_difference/max": 3.0585416555404663, "sampling/sampling_logp_difference/mean": 0.0038885611575096846, "step": 1040, "step_time": 7.041375279705972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2168.6666666666665, "completions/mean_length": 2115.1875, "completions/mean_terminated_length": 643.5644124348959, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.02088660206645727, "epoch": 0.12620192307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.003652440384030342, "learning_rate": 8.739182692307692e-07, "loss": 0.001, "num_tokens": 26839754.0, "reward": 0.5804064671198527, "reward_std": 0.2509392350912094, "rewards/reward_fn/mean": 0.5804064671198527, "rewards/reward_fn/std": 0.2509392350912094, "sampling/importance_sampling_ratio/max": 1.114885965983073, "sampling/importance_sampling_ratio/mean": 0.1780768185853958, "sampling/importance_sampling_ratio/min": 2.3131380961179577e-05, "sampling/sampling_logp_difference/max": 2.8979305823644004, "sampling/sampling_logp_difference/mean": 0.004123560075337688, "step": 1050, "step_time": 10.449254512693733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2454.5, "completions/mean_length": 1600.15625, "completions/mean_terminated_length": 760.2500305175781, "completions/min_length": 139.5, "completions/min_terminated_length": 139.5, "entropy": 0.020413204003125428, "epoch": 0.12740384615384615, "frac_reward_zero_std": 0.0, "grad_norm": 0.0033053613733500242, "learning_rate": 8.727163461538462e-07, "loss": -0.0022, "num_tokens": 27009716.0, "reward": 0.642702728509903, "reward_std": 0.29332374036312103, "rewards/reward_fn/mean": 0.642702728509903, "rewards/reward_fn/std": 0.29332372546195984, "sampling/importance_sampling_ratio/max": 0.9247338771820068, "sampling/importance_sampling_ratio/mean": 0.28104203939437866, "sampling/importance_sampling_ratio/min": 2.4419010628662363e-05, "sampling/sampling_logp_difference/max": 4.882091999053955, "sampling/sampling_logp_difference/mean": 0.004144340637139976, "step": 1060, "step_time": 7.150442614033818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1617.6666666666667, "completions/mean_length": 1932.71875, "completions/mean_terminated_length": 567.985585530599, "completions/min_length": 208.66666666666666, "completions/min_terminated_length": 208.66666666666666, "entropy": 0.01966061070561409, "epoch": 0.12860576923076922, "frac_reward_zero_std": 0.0, "grad_norm": 0.005625385325402021, "learning_rate": 8.71514423076923e-07, "loss": 0.0027, "num_tokens": 27309361.0, "reward": 0.5664631128311157, "reward_std": 0.30139854550361633, "rewards/reward_fn/mean": 0.5664631128311157, "rewards/reward_fn/std": 0.3013985554377238, "sampling/importance_sampling_ratio/max": 1.4370773235956829, "sampling/importance_sampling_ratio/mean": 0.24743729829788208, "sampling/importance_sampling_ratio/min": 0.00015057955291316225, "sampling/sampling_logp_difference/max": 2.415018916130066, "sampling/sampling_logp_difference/mean": 0.003759862777466575, "step": 1070, "step_time": 10.517333435453475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2267.5, "completions/mean_length": 1946.484375, "completions/mean_terminated_length": 590.0436096191406, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.021859880536794662, "epoch": 0.12980769230769232, "frac_reward_zero_std": 0.0, "grad_norm": 0.004208546597510576, "learning_rate": 8.703125e-07, "loss": -0.0022, "num_tokens": 27513688.0, "reward": 0.548939436674118, "reward_std": 0.2790210098028183, "rewards/reward_fn/mean": 0.548939436674118, "rewards/reward_fn/std": 0.2790210098028183, "sampling/importance_sampling_ratio/max": 1.8486292362213135, "sampling/importance_sampling_ratio/mean": 0.2491232380270958, "sampling/importance_sampling_ratio/min": 6.405415035715123e-06, "sampling/sampling_logp_difference/max": 3.9979273080825806, "sampling/sampling_logp_difference/mean": 0.004315083380788565, "step": 1080, "step_time": 7.290651701204479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3958333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1831.3333333333333, "completions/mean_length": 1537.53125, "completions/mean_terminated_length": 544.5627237955729, "completions/min_length": 138.66666666666666, "completions/min_terminated_length": 138.66666666666666, "entropy": 0.017338397447019815, "epoch": 0.1310096153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.01320074126124382, "learning_rate": 8.691105769230769e-07, "loss": 0.0066, "num_tokens": 27748523.0, "reward": 0.657452921072642, "reward_std": 0.2994132836659749, "rewards/reward_fn/mean": 0.657452921072642, "rewards/reward_fn/std": 0.2994132687648137, "sampling/importance_sampling_ratio/max": 1.2119524081548054, "sampling/importance_sampling_ratio/mean": 0.31659050782521564, "sampling/importance_sampling_ratio/min": 6.346364468375516e-05, "sampling/sampling_logp_difference/max": 3.547878384590149, "sampling/sampling_logp_difference/mean": 0.0040395252872258425, "step": 1090, "step_time": 10.2996531650424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1998.5, "completions/mean_length": 2273.953125, "completions/mean_terminated_length": 708.0208435058594, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.018801361322402954, "epoch": 0.13221153846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.002420782810077071, "learning_rate": 8.679086538461539e-07, "loss": -0.0039, "num_tokens": 27969360.0, "reward": 0.5146069824695587, "reward_std": 0.23850685358047485, "rewards/reward_fn/mean": 0.5146069824695587, "rewards/reward_fn/std": 0.23850686103105545, "sampling/importance_sampling_ratio/max": 1.2459397315979004, "sampling/importance_sampling_ratio/mean": 0.17138638347387314, "sampling/importance_sampling_ratio/min": 9.396316727361409e-06, "sampling/sampling_logp_difference/max": 3.6571473479270935, "sampling/sampling_logp_difference/mean": 0.003657846711575985, "step": 1100, "step_time": 7.122907925583422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5104166666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 1673.0, "completions/mean_length": 1807.7604166666667, "completions/mean_terminated_length": 566.2168782552084, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.021332334727048874, "epoch": 0.13341346153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.02433343045413494, "learning_rate": 8.667067307692307e-07, "loss": 0.009, "num_tokens": 28248801.0, "reward": 0.631587286790212, "reward_std": 0.257819135983785, "rewards/reward_fn/mean": 0.631587286790212, "rewards/reward_fn/std": 0.25781914591789246, "sampling/importance_sampling_ratio/max": 1.2082817554473877, "sampling/importance_sampling_ratio/mean": 0.23083489388227463, "sampling/importance_sampling_ratio/min": 5.7706371080712415e-05, "sampling/sampling_logp_difference/max": 3.227118730545044, "sampling/sampling_logp_difference/mean": 0.004580069954196612, "step": 1110, "step_time": 10.255625561159103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1685.0, "completions/mean_length": 1512.65625, "completions/mean_terminated_length": 571.8449249267578, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.023926609382033347, "epoch": 0.1346153846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.006183684803545475, "learning_rate": 8.655048076923076e-07, "loss": -0.0028, "num_tokens": 28421211.0, "reward": 0.6939028203487396, "reward_std": 0.26718906313180923, "rewards/reward_fn/mean": 0.6939028203487396, "rewards/reward_fn/std": 0.26718906313180923, "sampling/importance_sampling_ratio/max": 1.8280489444732666, "sampling/importance_sampling_ratio/mean": 0.29475389420986176, "sampling/importance_sampling_ratio/min": 0.0002082701430481393, "sampling/sampling_logp_difference/max": 3.0991536378860474, "sampling/sampling_logp_difference/mean": 0.0044942633248865604, "step": 1120, "step_time": 7.127964046038687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5208333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2296.6666666666665, "completions/mean_length": 1767.5208333333333, "completions/mean_terminated_length": 437.7875264485677, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.019647092558443546, "epoch": 0.13581730769230768, "frac_reward_zero_std": 0.0, "grad_norm": 0.002140216762199998, "learning_rate": 8.643028846153846e-07, "loss": 0.0027, "num_tokens": 28724181.0, "reward": 0.5952535072962443, "reward_std": 0.2693930963675181, "rewards/reward_fn/mean": 0.5952535072962443, "rewards/reward_fn/std": 0.26939308643341064, "sampling/importance_sampling_ratio/max": 1.3618757724761963, "sampling/importance_sampling_ratio/mean": 0.2990986704826355, "sampling/importance_sampling_ratio/min": 0.000559878972126171, "sampling/sampling_logp_difference/max": 2.2668572664260864, "sampling/sampling_logp_difference/mean": 0.003675971024980148, "step": 1130, "step_time": 10.539180643297732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1237.5, "completions/mean_length": 1597.484375, "completions/mean_terminated_length": 372.34471130371094, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.02274622842669487, "epoch": 0.13701923076923078, "frac_reward_zero_std": 0.0, "grad_norm": 0.005478045903146267, "learning_rate": 8.631009615384614e-07, "loss": -0.0013, "num_tokens": 28899844.0, "reward": 0.5935328602790833, "reward_std": 0.2841835767030716, "rewards/reward_fn/mean": 0.5935328602790833, "rewards/reward_fn/std": 0.2841835618019104, "sampling/importance_sampling_ratio/max": 1.502492755651474, "sampling/importance_sampling_ratio/mean": 0.31256621330976486, "sampling/importance_sampling_ratio/min": 5.682079972757492e-05, "sampling/sampling_logp_difference/max": 2.6528786420822144, "sampling/sampling_logp_difference/mean": 0.005419321358203888, "step": 1140, "step_time": 7.03784365337342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1362.6666666666667, "completions/mean_length": 1893.6666666666667, "completions/mean_terminated_length": 464.4285074869792, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.021784367971122265, "epoch": 0.13822115384615385, "frac_reward_zero_std": 0.0, "grad_norm": 0.0007718852139078081, "learning_rate": 8.618990384615385e-07, "loss": 0.0042, "num_tokens": 29202820.0, "reward": 0.5603048205375671, "reward_std": 0.2738872766494751, "rewards/reward_fn/mean": 0.5603048205375671, "rewards/reward_fn/std": 0.2738872766494751, "sampling/importance_sampling_ratio/max": 1.0561025937398274, "sampling/importance_sampling_ratio/mean": 0.19586566338936487, "sampling/importance_sampling_ratio/min": 1.3401521191410323e-05, "sampling/sampling_logp_difference/max": 9.482982238133749, "sampling/sampling_logp_difference/mean": 0.004385161524017652, "step": 1150, "step_time": 10.57040879484266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2068.0, "completions/mean_length": 1983.578125, "completions/mean_terminated_length": 758.9411926269531, "completions/min_length": 211.5, "completions/min_terminated_length": 211.5, "entropy": 0.02045146021991968, "epoch": 0.13942307692307693, "frac_reward_zero_std": 0.0, "grad_norm": 0.003486152272671461, "learning_rate": 8.606971153846153e-07, "loss": -0.0038, "num_tokens": 29409785.0, "reward": 0.587051123380661, "reward_std": 0.24719683825969696, "rewards/reward_fn/mean": 0.587051123380661, "rewards/reward_fn/std": 0.24719683825969696, "sampling/importance_sampling_ratio/max": 1.4089787006378174, "sampling/importance_sampling_ratio/mean": 0.20075182616710663, "sampling/importance_sampling_ratio/min": 8.868973236531019e-05, "sampling/sampling_logp_difference/max": 2.286481499671936, "sampling/sampling_logp_difference/mean": 0.004754116991534829, "step": 1160, "step_time": 7.263412564992905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3020833333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2461.0, "completions/mean_length": 1401.5729166666667, "completions/mean_terminated_length": 710.997314453125, "completions/min_length": 180.33333333333334, "completions/min_terminated_length": 180.33333333333334, "entropy": 0.02094292249530554, "epoch": 0.140625, "frac_reward_zero_std": 0.0, "grad_norm": 0.004305543377995491, "learning_rate": 8.594951923076923e-07, "loss": -0.0083, "num_tokens": 29643896.0, "reward": 0.6563338836034139, "reward_std": 0.27401378750801086, "rewards/reward_fn/mean": 0.6563338836034139, "rewards/reward_fn/std": 0.27401378254095715, "sampling/importance_sampling_ratio/max": 1.8707877000172932, "sampling/importance_sampling_ratio/mean": 0.3575558563073476, "sampling/importance_sampling_ratio/min": 0.00034951759638109553, "sampling/sampling_logp_difference/max": 2.146899461746216, "sampling/sampling_logp_difference/mean": 0.004625060285131137, "step": 1170, "step_time": 10.458020574972034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2431.0, "completions/mean_length": 1605.78125, "completions/mean_terminated_length": 716.4102783203125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.02174672605469823, "epoch": 0.14182692307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.008900653570890427, "learning_rate": 8.582932692307692e-07, "loss": -0.0029, "num_tokens": 29820898.0, "reward": 0.6339293718338013, "reward_std": 0.2855421304702759, "rewards/reward_fn/mean": 0.6339293718338013, "rewards/reward_fn/std": 0.2855421453714371, "sampling/importance_sampling_ratio/max": 1.2286553084850311, "sampling/importance_sampling_ratio/mean": 0.33184167742729187, "sampling/importance_sampling_ratio/min": 0.0003118448512395844, "sampling/sampling_logp_difference/max": 1.4997108578681946, "sampling/sampling_logp_difference/mean": 0.0045574543764814734, "step": 1180, "step_time": 7.057056932989508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4583333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2440.3333333333335, "completions/mean_length": 1740.0833333333333, "completions/mean_terminated_length": 667.3547566731771, "completions/min_length": 139.33333333333334, "completions/min_terminated_length": 139.33333333333334, "entropy": 0.02103317454457283, "epoch": 0.14302884615384615, "frac_reward_zero_std": 0.0, "grad_norm": 0.002817571396008134, "learning_rate": 8.570913461538461e-07, "loss": 0.0015, "num_tokens": 30086474.0, "reward": 0.655825138092041, "reward_std": 0.28227150936921436, "rewards/reward_fn/mean": 0.655825138092041, "rewards/reward_fn/std": 0.28227150440216064, "sampling/importance_sampling_ratio/max": 1.3102050026257832, "sampling/importance_sampling_ratio/mean": 0.28629302481810254, "sampling/importance_sampling_ratio/min": 3.167330836125378e-05, "sampling/sampling_logp_difference/max": 3.5291585127512612, "sampling/sampling_logp_difference/mean": 0.003904396202415228, "step": 1190, "step_time": 10.457742864638567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1223.0, "completions/mean_length": 2147.578125, "completions/mean_terminated_length": 505.9285888671875, "completions/min_length": 151.5, "completions/min_terminated_length": 151.5, "entropy": 0.02067391499876976, "epoch": 0.14423076923076922, "frac_reward_zero_std": 0.0, "grad_norm": 0.006047375034540892, "learning_rate": 8.55889423076923e-07, "loss": -0.0017, "num_tokens": 30293879.0, "reward": 0.5705578625202179, "reward_std": 0.24046512693166733, "rewards/reward_fn/mean": 0.5705578625202179, "rewards/reward_fn/std": 0.24046513438224792, "sampling/importance_sampling_ratio/max": 1.4161732196807861, "sampling/importance_sampling_ratio/mean": 0.17898795753717422, "sampling/importance_sampling_ratio/min": 9.06981100001758e-06, "sampling/sampling_logp_difference/max": 1.6436550617218018, "sampling/sampling_logp_difference/mean": 0.004371656337752938, "step": 1200, "step_time": 7.174395804572851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5416666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2164.0, "completions/mean_length": 1985.0104166666667, "completions/mean_terminated_length": 830.4814860026041, "completions/min_length": 160.33333333333334, "completions/min_terminated_length": 160.33333333333334, "entropy": 0.01992900259792805, "epoch": 0.14543269230769232, "frac_reward_zero_std": 0.0, "grad_norm": 0.003974648658186197, "learning_rate": 8.546875e-07, "loss": 0.002, "num_tokens": 30567200.0, "reward": 0.5794747173786163, "reward_std": 0.2944018840789795, "rewards/reward_fn/mean": 0.5794747173786163, "rewards/reward_fn/std": 0.2944018840789795, "sampling/importance_sampling_ratio/max": 1.102483332157135, "sampling/importance_sampling_ratio/mean": 0.21134507656097412, "sampling/importance_sampling_ratio/min": 4.440556343373222e-05, "sampling/sampling_logp_difference/max": 4.428368091583252, "sampling/sampling_logp_difference/mean": 0.004311837876836459, "step": 1210, "step_time": 10.346544927358627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2400.5, "completions/mean_length": 2100.296875, "completions/mean_terminated_length": 862.629150390625, "completions/min_length": 281.5, "completions/min_terminated_length": 281.5, "entropy": 0.020994905568659305, "epoch": 0.1466346153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.002429105807095766, "learning_rate": 8.534855769230769e-07, "loss": -0.004, "num_tokens": 30777179.0, "reward": 0.6178385317325592, "reward_std": 0.2577524185180664, "rewards/reward_fn/mean": 0.6178385317325592, "rewards/reward_fn/std": 0.2577524110674858, "sampling/importance_sampling_ratio/max": 1.6040958762168884, "sampling/importance_sampling_ratio/mean": 0.1476653516292572, "sampling/importance_sampling_ratio/min": 4.758899740409106e-05, "sampling/sampling_logp_difference/max": 3.2554149627685547, "sampling/sampling_logp_difference/mean": 0.004463996971026063, "step": 1220, "step_time": 7.187850875873119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3645833333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1794.3333333333333, "completions/mean_length": 1473.5104166666667, "completions/mean_terminated_length": 575.9410095214844, "completions/min_length": 170.66666666666666, "completions/min_terminated_length": 170.66666666666666, "entropy": 0.023866303637623786, "epoch": 0.14783653846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.006791263353079557, "learning_rate": 8.522836538461539e-07, "loss": -0.0049, "num_tokens": 31013916.0, "reward": 0.7003965775171915, "reward_std": 0.24659786621729532, "rewards/reward_fn/mean": 0.7003965775171915, "rewards/reward_fn/std": 0.2465978612502416, "sampling/importance_sampling_ratio/max": 1.4802163044611614, "sampling/importance_sampling_ratio/mean": 0.2649172246456146, "sampling/importance_sampling_ratio/min": 3.506972158599334e-05, "sampling/sampling_logp_difference/max": 2.697286049524943, "sampling/sampling_logp_difference/mean": 0.004931086984773477, "step": 1230, "step_time": 10.482457349356263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2481.0, "completions/mean_length": 1881.078125, "completions/mean_terminated_length": 602.2918701171875, "completions/min_length": 117.5, "completions/min_terminated_length": 117.5, "entropy": 0.020715872943401336, "epoch": 0.14903846153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.0020606990437954664, "learning_rate": 8.510817307692307e-07, "loss": -0.0031, "num_tokens": 31203457.0, "reward": 0.4268442690372467, "reward_std": 0.29985618591308594, "rewards/reward_fn/mean": 0.4268442690372467, "rewards/reward_fn/std": 0.29985615611076355, "sampling/importance_sampling_ratio/max": 1.0320657789707184, "sampling/importance_sampling_ratio/mean": 0.2647218853235245, "sampling/importance_sampling_ratio/min": 3.890213065460557e-05, "sampling/sampling_logp_difference/max": 4.591010212898254, "sampling/sampling_logp_difference/mean": 0.0038145948201417923, "step": 1240, "step_time": 7.063073081057519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3541666666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1417.3333333333333, "completions/mean_length": 1288.7291666666667, "completions/mean_terminated_length": 363.45094299316406, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.021300500817596914, "epoch": 0.1502403846153846, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.016093697398900986, "learning_rate": 8.498798076923076e-07, "loss": -0.0017, "num_tokens": 31431111.0, "reward": 0.5884052614370981, "reward_std": 0.2793298810720444, "rewards/reward_fn/mean": 0.5884052614370981, "rewards/reward_fn/std": 0.27932989100615185, "sampling/importance_sampling_ratio/max": 1.4846928517023723, "sampling/importance_sampling_ratio/mean": 0.4453837374846141, "sampling/importance_sampling_ratio/min": 0.00017988853505812585, "sampling/sampling_logp_difference/max": 2.0953648487726846, "sampling/sampling_logp_difference/mean": 0.0044569410383701324, "step": 1250, "step_time": 10.496338490676134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1714.0, "completions/mean_length": 1686.90625, "completions/mean_terminated_length": 755.6761474609375, "completions/min_length": 195.5, "completions/min_terminated_length": 195.5, "entropy": 0.02249424420297146, "epoch": 0.15144230769230768, "frac_reward_zero_std": 0.0, "grad_norm": 0.00470451544970274, "learning_rate": 8.486778846153846e-07, "loss": -0.0014, "num_tokens": 31616905.0, "reward": 0.6868482828140259, "reward_std": 0.24931537359952927, "rewards/reward_fn/mean": 0.6868482828140259, "rewards/reward_fn/std": 0.24931538850069046, "sampling/importance_sampling_ratio/max": 1.3751485347747803, "sampling/importance_sampling_ratio/mean": 0.2709423378109932, "sampling/importance_sampling_ratio/min": 1.7814586499298457e-06, "sampling/sampling_logp_difference/max": 3.9924397468566895, "sampling/sampling_logp_difference/mean": 0.004646357148885727, "step": 1260, "step_time": 7.09806506568566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2035.6666666666667, "completions/mean_length": 1770.8125, "completions/mean_terminated_length": 682.9558715820312, "completions/min_length": 148.33333333333334, "completions/min_terminated_length": 148.33333333333334, "entropy": 0.025174227356910706, "epoch": 0.15264423076923078, "frac_reward_zero_std": 0.0, "grad_norm": 0.0166124626994133, "learning_rate": 8.474759615384615e-07, "loss": -0.0009, "num_tokens": 31910663.0, "reward": 0.6340989271799723, "reward_std": 0.2722832163174947, "rewards/reward_fn/mean": 0.6340989271799723, "rewards/reward_fn/std": 0.2722832262516022, "sampling/importance_sampling_ratio/max": 1.3712806701660156, "sampling/importance_sampling_ratio/mean": 0.19201173384984335, "sampling/importance_sampling_ratio/min": 8.494347078643234e-05, "sampling/sampling_logp_difference/max": 3.340550343195597, "sampling/sampling_logp_difference/mean": 0.00514468007410566, "step": 1270, "step_time": 10.455771159287542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2106.5, "completions/mean_length": 2018.046875, "completions/mean_terminated_length": 792.4271240234375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.023011322133243084, "epoch": 0.15384615384615385, "frac_reward_zero_std": 0.0, "grad_norm": 0.0048550814390182495, "learning_rate": 8.462740384615384e-07, "loss": -0.0028, "num_tokens": 32113858.0, "reward": 0.5959661304950714, "reward_std": 0.3001696467399597, "rewards/reward_fn/mean": 0.5959661304950714, "rewards/reward_fn/std": 0.3001696467399597, "sampling/importance_sampling_ratio/max": 1.502270519733429, "sampling/importance_sampling_ratio/mean": 0.2382388561964035, "sampling/importance_sampling_ratio/min": 2.572633275121916e-05, "sampling/sampling_logp_difference/max": 2.175555944442749, "sampling/sampling_logp_difference/mean": 0.004775156266987324, "step": 1280, "step_time": 7.121258775796742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 1689.6875, "completions/mean_terminated_length": 535.0108235677084, "completions/min_length": 133.33333333333334, "completions/min_terminated_length": 133.33333333333334, "entropy": 0.01965275900438428, "epoch": 0.15504807692307693, "frac_reward_zero_std": 0.0, "grad_norm": 0.007970191538333893, "learning_rate": 8.450721153846154e-07, "loss": 0.0035, "num_tokens": 32375332.0, "reward": 0.6071279048919678, "reward_std": 0.2750819871822993, "rewards/reward_fn/mean": 0.6071279048919678, "rewards/reward_fn/std": 0.27508198221524555, "sampling/importance_sampling_ratio/max": 1.2696988185246785, "sampling/importance_sampling_ratio/mean": 0.2955208122730255, "sampling/importance_sampling_ratio/min": 8.670945726407808e-05, "sampling/sampling_logp_difference/max": 2.7350133260091147, "sampling/sampling_logp_difference/mean": 0.004634261441727479, "step": 1290, "step_time": 10.403573467489332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 1329.90625, "completions/mean_terminated_length": 416.70532989501953, "completions/min_length": 100.5, "completions/min_terminated_length": 100.5, "entropy": 0.020060095377266406, "epoch": 0.15625, "frac_reward_zero_std": 0.125, "grad_norm": 0.0033052810467779636, "learning_rate": 8.438701923076923e-07, "loss": 0.0016, "num_tokens": 32519934.0, "reward": 0.6228694021701813, "reward_std": 0.31474781036376953, "rewards/reward_fn/mean": 0.6228694021701813, "rewards/reward_fn/std": 0.3147478252649307, "sampling/importance_sampling_ratio/max": 1.6600127816200256, "sampling/importance_sampling_ratio/mean": 0.4704507887363434, "sampling/importance_sampling_ratio/min": 4.914865712635219e-05, "sampling/sampling_logp_difference/max": 2.409143805503845, "sampling/sampling_logp_difference/mean": 0.003981616115197539, "step": 1300, "step_time": 7.067065815534443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5833333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1639.0, "completions/mean_length": 2008.6354166666667, "completions/mean_terminated_length": 686.2583719889323, "completions/min_length": 136.66666666666666, "completions/min_terminated_length": 136.66666666666666, "entropy": 0.019661205261945723, "epoch": 0.15745192307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.007276620250195265, "learning_rate": 8.426682692307691e-07, "loss": 0.0005, "num_tokens": 32831643.0, "reward": 0.5900590618451437, "reward_std": 0.2763031820456187, "rewards/reward_fn/mean": 0.5900590618451437, "rewards/reward_fn/std": 0.2763031820456187, "sampling/importance_sampling_ratio/max": 1.3826154867808025, "sampling/importance_sampling_ratio/mean": 0.21692973375320435, "sampling/importance_sampling_ratio/min": 3.4055985755306516e-05, "sampling/sampling_logp_difference/max": 4.934376358985901, "sampling/sampling_logp_difference/mean": 0.004091768835981687, "step": 1310, "step_time": 10.453550106380135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2228.5, "completions/mean_length": 1709.65625, "completions/mean_terminated_length": 705.6458435058594, "completions/min_length": 164.5, "completions/min_terminated_length": 164.5, "entropy": 0.02187402956187725, "epoch": 0.15865384615384615, "frac_reward_zero_std": 0.0, "grad_norm": 0.0010451622074469924, "learning_rate": 8.414663461538462e-07, "loss": 0.001, "num_tokens": 33008325.0, "reward": 0.6210663318634033, "reward_std": 0.27221207320690155, "rewards/reward_fn/mean": 0.6210663318634033, "rewards/reward_fn/std": 0.27221207320690155, "sampling/importance_sampling_ratio/max": 1.1746456325054169, "sampling/importance_sampling_ratio/mean": 0.20078734681010246, "sampling/importance_sampling_ratio/min": 0.00014998351343820104, "sampling/sampling_logp_difference/max": 3.8606733083724976, "sampling/sampling_logp_difference/mean": 0.004743166267871857, "step": 1320, "step_time": 7.097293344698846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3229166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2231.0, "completions/mean_length": 1370.3645833333333, "completions/mean_terminated_length": 593.107666015625, "completions/min_length": 158.33333333333334, "completions/min_terminated_length": 158.33333333333334, "entropy": 0.02324679046869278, "epoch": 0.15985576923076922, "frac_reward_zero_std": 0.0, "grad_norm": 0.007701457012444735, "learning_rate": 8.40264423076923e-07, "loss": -0.0004, "num_tokens": 33257304.0, "reward": 0.7145009239514669, "reward_std": 0.2782009045283, "rewards/reward_fn/mean": 0.7145009239514669, "rewards/reward_fn/std": 0.2782009094953537, "sampling/importance_sampling_ratio/max": 1.2666982412338257, "sampling/importance_sampling_ratio/mean": 0.27457969387372333, "sampling/importance_sampling_ratio/min": 0.00023002012070113173, "sampling/sampling_logp_difference/max": 2.8397083282470703, "sampling/sampling_logp_difference/mean": 0.005122587239990632, "step": 1330, "step_time": 10.326984881423414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1504.5, "completions/mean_length": 1171.234375, "completions/mean_terminated_length": 526.8854675292969, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.018949515745043756, "epoch": 0.16105769230769232, "frac_reward_zero_std": 0.125, "grad_norm": 0.014599017798900604, "learning_rate": 8.390625e-07, "loss": 0.0119, "num_tokens": 33403695.0, "reward": 0.5884002447128296, "reward_std": 0.33791089057922363, "rewards/reward_fn/mean": 0.5884002447128296, "rewards/reward_fn/std": 0.33791089057922363, "sampling/importance_sampling_ratio/max": 1.816356748342514, "sampling/importance_sampling_ratio/mean": 0.4598829001188278, "sampling/importance_sampling_ratio/min": 0.0004295945000194479, "sampling/sampling_logp_difference/max": 1.7713277339935303, "sampling/sampling_logp_difference/mean": 0.003931600134819746, "step": 1340, "step_time": 6.88840487441048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4479166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1610.6666666666667, "completions/mean_length": 1569.4375, "completions/mean_terminated_length": 408.3737386067708, "completions/min_length": 101.66666666666667, "completions/min_terminated_length": 101.66666666666667, "entropy": 0.016116989869624378, "epoch": 0.1622596153846154, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.005813992582261562, "learning_rate": 8.378605769230769e-07, "loss": -0.0061, "num_tokens": 33659009.0, "reward": 0.5195292631785074, "reward_std": 0.32983019948005676, "rewards/reward_fn/mean": 0.5195292631785074, "rewards/reward_fn/std": 0.32983020941416424, "sampling/importance_sampling_ratio/max": 1.220324695110321, "sampling/importance_sampling_ratio/mean": 0.3912711938222249, "sampling/importance_sampling_ratio/min": 0.00015191672173386905, "sampling/sampling_logp_difference/max": 2.2951486110687256, "sampling/sampling_logp_difference/mean": 0.0034597011593480906, "step": 1350, "step_time": 10.271078008133918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 1549.5, "completions/mean_length": 1089.46875, "completions/mean_terminated_length": 452.4928741455078, "completions/min_length": 122.5, "completions/min_terminated_length": 122.5, "entropy": 0.02486976087093353, "epoch": 0.16346153846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.009166580624878407, "learning_rate": 8.366586538461538e-07, "loss": -0.0036, "num_tokens": 33799927.0, "reward": 0.7567602694034576, "reward_std": 0.2489125356078148, "rewards/reward_fn/mean": 0.7567602694034576, "rewards/reward_fn/std": 0.2489125281572342, "sampling/importance_sampling_ratio/max": 1.4027708768844604, "sampling/importance_sampling_ratio/mean": 0.3415788263082504, "sampling/importance_sampling_ratio/min": 0.00010061307102660066, "sampling/sampling_logp_difference/max": 2.606516480445862, "sampling/sampling_logp_difference/mean": 0.005452593322843313, "step": 1360, "step_time": 7.080280099343509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4791666666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1293.0, "completions/mean_length": 1691.9375, "completions/mean_terminated_length": 499.1619364420573, "completions/min_length": 150.66666666666666, "completions/min_terminated_length": 150.66666666666666, "entropy": 0.022606744058430196, "epoch": 0.16466346153846154, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.006677900440990925, "learning_rate": 8.354567307692307e-07, "loss": -0.0035, "num_tokens": 34077729.0, "reward": 0.6167490482330322, "reward_std": 0.3221948246161143, "rewards/reward_fn/mean": 0.6167490482330322, "rewards/reward_fn/std": 0.3221948246161143, "sampling/importance_sampling_ratio/max": 1.1601065794626872, "sampling/importance_sampling_ratio/mean": 0.2559601664543152, "sampling/importance_sampling_ratio/min": 5.324605449648819e-05, "sampling/sampling_logp_difference/max": 6.120669444402059, "sampling/sampling_logp_difference/mean": 0.004701889818534255, "step": 1370, "step_time": 10.480983305163681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1866.0, "completions/mean_length": 1881.140625, "completions/mean_terminated_length": 532.0528869628906, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.022522210516035555, "epoch": 0.1658653846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.0022283110301941633, "learning_rate": 8.342548076923076e-07, "loss": -0.0021, "num_tokens": 34263546.0, "reward": 0.6101175546646118, "reward_std": 0.2907613664865494, "rewards/reward_fn/mean": 0.6101175546646118, "rewards/reward_fn/std": 0.29076137393713, "sampling/importance_sampling_ratio/max": 1.023892194032669, "sampling/importance_sampling_ratio/mean": 0.230641707777977, "sampling/importance_sampling_ratio/min": 7.855811645640642e-06, "sampling/sampling_logp_difference/max": 2.956837296485901, "sampling/sampling_logp_difference/mean": 0.004441728349775076, "step": 1380, "step_time": 7.186302416678518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1917.3333333333333, "completions/mean_length": 1741.5833333333333, "completions/mean_terminated_length": 649.8935546875, "completions/min_length": 177.33333333333334, "completions/min_terminated_length": 177.33333333333334, "entropy": 0.020885018911212683, "epoch": 0.16706730769230768, "frac_reward_zero_std": 0.0, "grad_norm": 0.019621707499027252, "learning_rate": 8.330528846153846e-07, "loss": 0.006, "num_tokens": 34540634.0, "reward": 0.6402950684229533, "reward_std": 0.2507072190443675, "rewards/reward_fn/mean": 0.6402950684229533, "rewards/reward_fn/std": 0.2507072190443675, "sampling/importance_sampling_ratio/max": 1.3005338112513225, "sampling/importance_sampling_ratio/mean": 0.2919175922870636, "sampling/importance_sampling_ratio/min": 2.2492751099889574e-05, "sampling/sampling_logp_difference/max": 2.8657581011454263, "sampling/sampling_logp_difference/mean": 0.004545029407987992, "step": 1390, "step_time": 10.286894747242332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1983.5, "completions/mean_length": 1793.921875, "completions/mean_terminated_length": 501.61402893066406, "completions/min_length": 125.5, "completions/min_terminated_length": 125.5, "entropy": 0.02322037797421217, "epoch": 0.16826923076923078, "frac_reward_zero_std": 0.0, "grad_norm": 0.016231730580329895, "learning_rate": 8.318509615384615e-07, "loss": -0.001, "num_tokens": 34732173.0, "reward": 0.6259602904319763, "reward_std": 0.2881501019001007, "rewards/reward_fn/mean": 0.6259602904319763, "rewards/reward_fn/std": 0.2881501019001007, "sampling/importance_sampling_ratio/max": 1.556132972240448, "sampling/importance_sampling_ratio/mean": 0.2172999531030655, "sampling/importance_sampling_ratio/min": 4.7282254058700346e-05, "sampling/sampling_logp_difference/max": 3.791698694229126, "sampling/sampling_logp_difference/mean": 0.004521505208685994, "step": 1400, "step_time": 7.1878642124123875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 3000.0, "completions/max_terminated_length": 2603.0, "completions/mean_length": 1853.9895833333333, "completions/mean_terminated_length": 726.3951009114584, "completions/min_length": 154.33333333333334, "completions/min_terminated_length": 154.33333333333334, "entropy": 0.02275430876761675, "epoch": 0.16947115384615385, "frac_reward_zero_std": 0.0, "grad_norm": 0.004962276667356491, "learning_rate": 8.306490384615385e-07, "loss": -0.0015, "num_tokens": 35012268.0, "reward": 0.5553853511810303, "reward_std": 0.29245852927366894, "rewards/reward_fn/mean": 0.5553853511810303, "rewards/reward_fn/std": 0.29245852927366894, "sampling/importance_sampling_ratio/max": 1.3409303029378254, "sampling/importance_sampling_ratio/mean": 0.19621065258979797, "sampling/importance_sampling_ratio/min": 2.034181428219502e-05, "sampling/sampling_logp_difference/max": 3.815922975540161, "sampling/sampling_logp_difference/mean": 0.0046753462714453535, "step": 1410, "step_time": 10.459869264438748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1894.0, "completions/mean_length": 1556.71875, "completions/mean_terminated_length": 502.5130081176758, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.01573435729369521, "epoch": 0.17067307692307693, "frac_reward_zero_std": 0.125, "grad_norm": 0.0027201788034290075, "learning_rate": 8.294471153846153e-07, "loss": 0.0114, "num_tokens": 35187066.0, "reward": 0.39313554763793945, "reward_std": 0.33733808994293213, "rewards/reward_fn/mean": 0.39313554763793945, "rewards/reward_fn/std": 0.33733807504177094, "sampling/importance_sampling_ratio/max": 1.3253125548362732, "sampling/importance_sampling_ratio/mean": 0.3555862531065941, "sampling/importance_sampling_ratio/min": 9.88124907053134e-05, "sampling/sampling_logp_difference/max": 6.176225304603577, "sampling/sampling_logp_difference/mean": 0.003838788950815797, "step": 1420, "step_time": 7.273680136539042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4583333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1928.0, "completions/mean_length": 1691.8020833333333, "completions/mean_terminated_length": 588.5021769205729, "completions/min_length": 154.66666666666666, "completions/min_terminated_length": 154.66666666666666, "entropy": 0.020692226476967335, "epoch": 0.171875, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.009500250220298767, "learning_rate": 8.282451923076923e-07, "loss": -0.0032, "num_tokens": 35446455.0, "reward": 0.4674461881319682, "reward_std": 0.3292344609896342, "rewards/reward_fn/mean": 0.4674461881319682, "rewards/reward_fn/std": 0.32923445105552673, "sampling/importance_sampling_ratio/max": 1.3137840429941814, "sampling/importance_sampling_ratio/mean": 0.3022607664267222, "sampling/importance_sampling_ratio/min": 5.767879671717916e-05, "sampling/sampling_logp_difference/max": 3.586015542348226, "sampling/sampling_logp_difference/mean": 0.004064937277386586, "step": 1430, "step_time": 10.469162499904632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1580.5, "completions/mean_length": 1197.640625, "completions/mean_terminated_length": 318.2489318847656, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.017679289542138578, "epoch": 0.17307692307692307, "frac_reward_zero_std": 0.125, "grad_norm": 0.009024336002767086, "learning_rate": 8.270432692307692e-07, "loss": -0.015, "num_tokens": 35596832.0, "reward": 0.6765373349189758, "reward_std": 0.2784782722592354, "rewards/reward_fn/mean": 0.6765373349189758, "rewards/reward_fn/std": 0.2784782722592354, "sampling/importance_sampling_ratio/max": 1.5182456374168396, "sampling/importance_sampling_ratio/mean": 0.5427403450012207, "sampling/importance_sampling_ratio/min": 6.42512950435048e-05, "sampling/sampling_logp_difference/max": 3.848830461502075, "sampling/sampling_logp_difference/mean": 0.003795555792748928, "step": 1440, "step_time": 6.983411037176848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3541666666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 897.6666666666666, "completions/mean_length": 1232.5625, "completions/mean_terminated_length": 257.2918955485026, "completions/min_length": 115.33333333333333, "completions/min_terminated_length": 115.33333333333333, "entropy": 0.01741932863369584, "epoch": 0.17427884615384615, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.002411284251138568, "learning_rate": 8.258413461538461e-07, "loss": 0.0025, "num_tokens": 35824086.0, "reward": 0.6181753079096476, "reward_std": 0.30296502510706586, "rewards/reward_fn/mean": 0.6181753079096476, "rewards/reward_fn/std": 0.3029650350411733, "sampling/importance_sampling_ratio/max": 1.5078622897466023, "sampling/importance_sampling_ratio/mean": 0.4880000551541646, "sampling/importance_sampling_ratio/min": 1.4165377857959053e-05, "sampling/sampling_logp_difference/max": 2.4987725416819253, "sampling/sampling_logp_difference/mean": 0.0038593773109217486, "step": 1450, "step_time": 10.379560447856784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1041.0, "completions/mean_length": 1041.734375, "completions/mean_terminated_length": 275.45652770996094, "completions/min_length": 105.5, "completions/min_terminated_length": 105.5, "entropy": 0.020795615576207637, "epoch": 0.17548076923076922, "frac_reward_zero_std": 0.0, "grad_norm": 0.0046553113497793674, "learning_rate": 8.246394230769231e-07, "loss": -0.002, "num_tokens": 35953613.0, "reward": 0.6633073687553406, "reward_std": 0.30293525755405426, "rewards/reward_fn/mean": 0.6633073687553406, "rewards/reward_fn/std": 0.30293527245521545, "sampling/importance_sampling_ratio/max": 1.7489829063415527, "sampling/importance_sampling_ratio/mean": 0.47400490939617157, "sampling/importance_sampling_ratio/min": 0.00031544714875053614, "sampling/sampling_logp_difference/max": 1.5502816438674927, "sampling/sampling_logp_difference/mean": 0.004480669274926186, "step": 1460, "step_time": 6.938573564961553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4583333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2439.6666666666665, "completions/mean_length": 1706.25, "completions/mean_terminated_length": 616.1899210611979, "completions/min_length": 176.33333333333334, "completions/min_terminated_length": 176.33333333333334, "entropy": 0.021535934694111348, "epoch": 0.17668269230769232, "frac_reward_zero_std": 0.0, "grad_norm": 0.002909771166741848, "learning_rate": 8.234375e-07, "loss": 0.0001, "num_tokens": 36236493.0, "reward": 0.6232651670773824, "reward_std": 0.2752409478028615, "rewards/reward_fn/mean": 0.6232651670773824, "rewards/reward_fn/std": 0.2752409378687541, "sampling/importance_sampling_ratio/max": 1.0321370760599773, "sampling/importance_sampling_ratio/mean": 0.21891547242800394, "sampling/importance_sampling_ratio/min": 4.1316936403745785e-05, "sampling/sampling_logp_difference/max": 2.426916718482971, "sampling/sampling_logp_difference/mean": 0.004759375471621752, "step": 1470, "step_time": 10.420345912873746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1818.5, "completions/mean_length": 1493.0625, "completions/mean_terminated_length": 547.9320678710938, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.024216885678470135, "epoch": 0.1778846153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.01511643547564745, "learning_rate": 8.222355769230768e-07, "loss": -0.0045, "num_tokens": 36407969.0, "reward": 0.6157480627298355, "reward_std": 0.28374165296554565, "rewards/reward_fn/mean": 0.6157480627298355, "rewards/reward_fn/std": 0.28374165296554565, "sampling/importance_sampling_ratio/max": 1.099589616060257, "sampling/importance_sampling_ratio/mean": 0.27723218500614166, "sampling/importance_sampling_ratio/min": 4.080505800629908e-05, "sampling/sampling_logp_difference/max": 2.3034810423851013, "sampling/sampling_logp_difference/mean": 0.0048214641865342855, "step": 1480, "step_time": 7.049120948836207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 3000.0, "completions/max_terminated_length": 2263.0, "completions/mean_length": 1756.2395833333333, "completions/mean_terminated_length": 495.55614217122394, "completions/min_length": 103.66666666666667, "completions/min_terminated_length": 103.66666666666667, "entropy": 0.020070930570364, "epoch": 0.17908653846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.0039961193688213825, "learning_rate": 8.210336538461539e-07, "loss": -0.0046, "num_tokens": 36674520.0, "reward": 0.5941232442855835, "reward_std": 0.29488591353098553, "rewards/reward_fn/mean": 0.5941232442855835, "rewards/reward_fn/std": 0.29488590359687805, "sampling/importance_sampling_ratio/max": 1.4310051600138347, "sampling/importance_sampling_ratio/mean": 0.3461870650450389, "sampling/importance_sampling_ratio/min": 7.814146829332458e-05, "sampling/sampling_logp_difference/max": 2.95281712214152, "sampling/sampling_logp_difference/mean": 0.00437747019653519, "step": 1490, "step_time": 10.254671798367053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2277.0, "completions/mean_length": 1826.359375, "completions/mean_terminated_length": 747.8158264160156, "completions/min_length": 133.5, "completions/min_terminated_length": 133.5, "entropy": 0.02123525459319353, "epoch": 0.18028846153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.0058107925578951836, "learning_rate": 8.198317307692307e-07, "loss": 0.0004, "num_tokens": 36854487.0, "reward": 0.6495044827461243, "reward_std": 0.2572687119245529, "rewards/reward_fn/mean": 0.6495044827461243, "rewards/reward_fn/std": 0.2572687119245529, "sampling/importance_sampling_ratio/max": 0.9959781169891357, "sampling/importance_sampling_ratio/mean": 0.22274453938007355, "sampling/importance_sampling_ratio/min": 4.531444028543774e-05, "sampling/sampling_logp_difference/max": 2.713387966156006, "sampling/sampling_logp_difference/mean": 0.004440524615347385, "step": 1500, "step_time": 7.141270723473281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4479166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2232.6666666666665, "completions/mean_length": 1771.8854166666667, "completions/mean_terminated_length": 773.5833740234375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.018975120130926372, "epoch": 0.1814903846153846, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.006242536939680576, "learning_rate": 8.186298076923076e-07, "loss": -0.0002, "num_tokens": 37143460.0, "reward": 0.5835822522640228, "reward_std": 0.3133925000826518, "rewards/reward_fn/mean": 0.5835822522640228, "rewards/reward_fn/std": 0.3133925100167592, "sampling/importance_sampling_ratio/max": 1.1050077080726624, "sampling/importance_sampling_ratio/mean": 0.26846445600191754, "sampling/importance_sampling_ratio/min": 0.000112733687274158, "sampling/sampling_logp_difference/max": 2.1254302263259888, "sampling/sampling_logp_difference/mean": 0.003837712419529756, "step": 1510, "step_time": 10.353138939011842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2558.5, "completions/mean_length": 1426.375, "completions/mean_terminated_length": 549.2628479003906, "completions/min_length": 130.5, "completions/min_terminated_length": 130.5, "entropy": 0.021787562314420937, "epoch": 0.18269230769230768, "frac_reward_zero_std": 0.0, "grad_norm": 0.0034128010738641024, "learning_rate": 8.174278846153846e-07, "loss": -0.002, "num_tokens": 37316516.0, "reward": 0.6963077783584595, "reward_std": 0.2626548111438751, "rewards/reward_fn/mean": 0.6963077783584595, "rewards/reward_fn/std": 0.26265479624271393, "sampling/importance_sampling_ratio/max": 1.190653145313263, "sampling/importance_sampling_ratio/mean": 0.2816203534603119, "sampling/importance_sampling_ratio/min": 0.0003580446355044842, "sampling/sampling_logp_difference/max": 1.7096443176269531, "sampling/sampling_logp_difference/mean": 0.004420998739078641, "step": 1520, "step_time": 7.141745381709188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4166666666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1405.3333333333333, "completions/mean_length": 1479.3958333333333, "completions/mean_terminated_length": 411.422368367513, "completions/min_length": 127.33333333333333, "completions/min_terminated_length": 127.33333333333333, "entropy": 0.018425310123711826, "epoch": 0.18389423076923078, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.0028748787008225918, "learning_rate": 8.162259615384614e-07, "loss": 0.0091, "num_tokens": 37562058.0, "reward": 0.5760854880015055, "reward_std": 0.31298040350278217, "rewards/reward_fn/mean": 0.5760854880015055, "rewards/reward_fn/std": 0.31298040350278217, "sampling/importance_sampling_ratio/max": 1.2181646426518757, "sampling/importance_sampling_ratio/mean": 0.39910490314165753, "sampling/importance_sampling_ratio/min": 0.0007061959013299202, "sampling/sampling_logp_difference/max": 2.299155672391256, "sampling/sampling_logp_difference/mean": 0.004014244070276618, "step": 1530, "step_time": 10.270501963049174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 3000.0, "completions/max_terminated_length": 2860.0, "completions/mean_length": 1799.125, "completions/mean_terminated_length": 607.5686340332031, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.018978177197277547, "epoch": 0.18509615384615385, "frac_reward_zero_std": 0.0, "grad_norm": 0.008226425386965275, "learning_rate": 8.150240384615384e-07, "loss": -0.0026, "num_tokens": 37745026.0, "reward": 0.6200755834579468, "reward_std": 0.3187096416950226, "rewards/reward_fn/mean": 0.6200755834579468, "rewards/reward_fn/std": 0.3187096416950226, "sampling/importance_sampling_ratio/max": 2.1071826219558716, "sampling/importance_sampling_ratio/mean": 0.3048808202147484, "sampling/importance_sampling_ratio/min": 0.00029282977988032144, "sampling/sampling_logp_difference/max": 4.973349213600159, "sampling/sampling_logp_difference/mean": 0.004207737511023879, "step": 1540, "step_time": 7.1671469963155685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1944.6666666666667, "completions/mean_length": 1432.9895833333333, "completions/mean_terminated_length": 644.5528259277344, "completions/min_length": 125.33333333333333, "completions/min_terminated_length": 125.33333333333333, "entropy": 0.020526463538408278, "epoch": 0.18629807692307693, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.005565175320953131, "learning_rate": 8.138221153846153e-07, "loss": 0.0029, "num_tokens": 38010705.0, "reward": 0.6584534049034119, "reward_std": 0.2805156509081523, "rewards/reward_fn/mean": 0.6584534049034119, "rewards/reward_fn/std": 0.2805156409740448, "sampling/importance_sampling_ratio/max": 1.3889702558517456, "sampling/importance_sampling_ratio/mean": 0.344974547624588, "sampling/importance_sampling_ratio/min": 0.0013348925531317946, "sampling/sampling_logp_difference/max": 2.905367930730184, "sampling/sampling_logp_difference/mean": 0.004379773357262214, "step": 1550, "step_time": 10.640483521856368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1781.0, "completions/mean_length": 2105.953125, "completions/mean_terminated_length": 621.375, "completions/min_length": 241.5, "completions/min_terminated_length": 241.5, "entropy": 0.018829497788101435, "epoch": 0.1875, "frac_reward_zero_std": 0.0, "grad_norm": 0.0022328023333102465, "learning_rate": 8.126201923076923e-07, "loss": -0.0008, "num_tokens": 38215726.0, "reward": 0.5813806056976318, "reward_std": 0.24832353740930557, "rewards/reward_fn/mean": 0.5813806056976318, "rewards/reward_fn/std": 0.24832353740930557, "sampling/importance_sampling_ratio/max": 0.8762839734554291, "sampling/importance_sampling_ratio/mean": 0.1913088597357273, "sampling/importance_sampling_ratio/min": 0.0005201872263569385, "sampling/sampling_logp_difference/max": 2.1799063086509705, "sampling/sampling_logp_difference/mean": 0.0038409699918702245, "step": 1560, "step_time": 7.052625392284244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2604166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2261.3333333333335, "completions/mean_length": 1165.0416666666667, "completions/mean_terminated_length": 523.4166056315104, "completions/min_length": 96.33333333333333, "completions/min_terminated_length": 96.33333333333333, "entropy": 0.02060826849192381, "epoch": 0.18870192307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.012332085520029068, "learning_rate": 8.114182692307692e-07, "loss": -0.0, "num_tokens": 38431122.0, "reward": 0.6620953679084778, "reward_std": 0.29280886054039, "rewards/reward_fn/mean": 0.6620953679084778, "rewards/reward_fn/std": 0.2928088406721751, "sampling/importance_sampling_ratio/max": 1.360703667004903, "sampling/importance_sampling_ratio/mean": 0.42509161432584125, "sampling/importance_sampling_ratio/min": 0.00018924607623678943, "sampling/sampling_logp_difference/max": 1.9867877165476482, "sampling/sampling_logp_difference/mean": 0.004582547427465518, "step": 1570, "step_time": 10.33811799408868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2204.5, "completions/mean_length": 1512.171875, "completions/mean_terminated_length": 564.8055725097656, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.02213138323277235, "epoch": 0.18990384615384615, "frac_reward_zero_std": 0.0, "grad_norm": 0.005174688994884491, "learning_rate": 8.102163461538462e-07, "loss": -0.0012, "num_tokens": 38592589.0, "reward": 0.7159659564495087, "reward_std": 0.25119882076978683, "rewards/reward_fn/mean": 0.7159659564495087, "rewards/reward_fn/std": 0.25119880586862564, "sampling/importance_sampling_ratio/max": 1.1298002004623413, "sampling/importance_sampling_ratio/mean": 0.28383538872003555, "sampling/importance_sampling_ratio/min": 0.0002727817263803445, "sampling/sampling_logp_difference/max": 1.910941243171692, "sampling/sampling_logp_difference/mean": 0.004584843525663018, "step": 1580, "step_time": 7.021651083882898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.22916666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 1624.0, "completions/mean_length": 1030.0, "completions/mean_terminated_length": 472.66668192545575, "completions/min_length": 97.66666666666667, "completions/min_terminated_length": 97.66666666666667, "entropy": 0.02183762602508068, "epoch": 0.19110576923076922, "frac_reward_zero_std": 0.0, "grad_norm": 0.03083266131579876, "learning_rate": 8.09014423076923e-07, "loss": 0.001, "num_tokens": 38790749.0, "reward": 0.672552764415741, "reward_std": 0.3112578938404719, "rewards/reward_fn/mean": 0.672552764415741, "rewards/reward_fn/std": 0.31125788390636444, "sampling/importance_sampling_ratio/max": 1.5774198373158772, "sampling/importance_sampling_ratio/mean": 0.4906708200772603, "sampling/importance_sampling_ratio/min": 0.00021521816294504484, "sampling/sampling_logp_difference/max": 1.3702741861343384, "sampling/sampling_logp_difference/mean": 0.004920999053865671, "step": 1590, "step_time": 9.975240693055094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2135.0, "completions/mean_length": 1749.640625, "completions/mean_terminated_length": 682.3166809082031, "completions/min_length": 152.5, "completions/min_terminated_length": 152.5, "entropy": 0.02398629654198885, "epoch": 0.19230769230769232, "frac_reward_zero_std": 0.0, "grad_norm": 0.03316458314657211, "learning_rate": 8.078125e-07, "loss": -0.0209, "num_tokens": 38962766.0, "reward": 0.6671689450740814, "reward_std": 0.26754381507635117, "rewards/reward_fn/mean": 0.6671689450740814, "rewards/reward_fn/std": 0.26754381507635117, "sampling/importance_sampling_ratio/max": 1.541327714920044, "sampling/importance_sampling_ratio/mean": 0.2191411405801773, "sampling/importance_sampling_ratio/min": 2.513903865519751e-06, "sampling/sampling_logp_difference/max": 2.6608888506889343, "sampling/sampling_logp_difference/mean": 0.00517250201664865, "step": 1600, "step_time": 6.999857087619603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4270833333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1908.3333333333333, "completions/mean_length": 1557.0416666666667, "completions/mean_terminated_length": 535.129628499349, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.018966123275458813, "epoch": 0.1935096153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.007622858509421349, "learning_rate": 8.066105769230769e-07, "loss": -0.0004, "num_tokens": 39217162.0, "reward": 0.6377670367558798, "reward_std": 0.2693122923374176, "rewards/reward_fn/mean": 0.6377670367558798, "rewards/reward_fn/std": 0.2693122973044713, "sampling/importance_sampling_ratio/max": 0.9369579752286276, "sampling/importance_sampling_ratio/mean": 0.32015671332677204, "sampling/importance_sampling_ratio/min": 9.517968207243636e-05, "sampling/sampling_logp_difference/max": 4.360260446866353, "sampling/sampling_logp_difference/mean": 0.0045287572623540955, "step": 1610, "step_time": 10.169157881569117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1881.0, "completions/mean_length": 1897.875, "completions/mean_terminated_length": 648.8000335693359, "completions/min_length": 122.5, "completions/min_terminated_length": 122.5, "entropy": 0.020911477878689765, "epoch": 0.19471153846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.0031333996448665857, "learning_rate": 8.054086538461538e-07, "loss": -0.0023, "num_tokens": 39395506.0, "reward": 0.545472577214241, "reward_std": 0.29300789535045624, "rewards/reward_fn/mean": 0.545472577214241, "rewards/reward_fn/std": 0.29300789535045624, "sampling/importance_sampling_ratio/max": 1.542718768119812, "sampling/importance_sampling_ratio/mean": 0.2679113522171974, "sampling/importance_sampling_ratio/min": 4.0848575736163184e-05, "sampling/sampling_logp_difference/max": 2.453081429004669, "sampling/sampling_logp_difference/mean": 0.004542592214420438, "step": 1620, "step_time": 7.142730783019215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5104166666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2421.3333333333335, "completions/mean_length": 1810.9583333333333, "completions/mean_terminated_length": 573.6329040527344, "completions/min_length": 150.33333333333334, "completions/min_terminated_length": 150.33333333333334, "entropy": 0.01836366709321737, "epoch": 0.19591346153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.007233439479023218, "learning_rate": 8.042067307692308e-07, "loss": 0.0042, "num_tokens": 39674774.0, "reward": 0.5448157588640848, "reward_std": 0.3016149550676346, "rewards/reward_fn/mean": 0.5448157588640848, "rewards/reward_fn/std": 0.3016149451335271, "sampling/importance_sampling_ratio/max": 1.0180341800053914, "sampling/importance_sampling_ratio/mean": 0.27356448769569397, "sampling/importance_sampling_ratio/min": 3.473014051754338e-05, "sampling/sampling_logp_difference/max": 3.0472635428110757, "sampling/sampling_logp_difference/mean": 0.0038481197940806546, "step": 1630, "step_time": 10.500119385030121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1558.5, "completions/mean_length": 1468.875, "completions/mean_terminated_length": 445.9362487792969, "completions/min_length": 143.5, "completions/min_terminated_length": 143.5, "entropy": 0.021448279730975628, "epoch": 0.1971153846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.016534829512238503, "learning_rate": 8.030048076923076e-07, "loss": -0.0013, "num_tokens": 39856854.0, "reward": 0.688113808631897, "reward_std": 0.2576809972524643, "rewards/reward_fn/mean": 0.688113808631897, "rewards/reward_fn/std": 0.2576809898018837, "sampling/importance_sampling_ratio/max": 1.5628597140312195, "sampling/importance_sampling_ratio/mean": 0.30359068512916565, "sampling/importance_sampling_ratio/min": 1.4969801597430887e-05, "sampling/sampling_logp_difference/max": 2.1224790811538696, "sampling/sampling_logp_difference/mean": 0.0051246099174022675, "step": 1640, "step_time": 7.243328231759369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4791666666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1581.6666666666667, "completions/mean_length": 1724.0104166666667, "completions/mean_terminated_length": 535.8127136230469, "completions/min_length": 131.66666666666666, "completions/min_terminated_length": 131.66666666666666, "entropy": 0.02387437950819731, "epoch": 0.19831730769230768, "frac_reward_zero_std": 0.0, "grad_norm": 0.006160501856356859, "learning_rate": 8.018028846153845e-07, "loss": 0.0031, "num_tokens": 40139271.0, "reward": 0.6037543416023254, "reward_std": 0.2712361713250478, "rewards/reward_fn/mean": 0.6037543416023254, "rewards/reward_fn/std": 0.2712361713250478, "sampling/importance_sampling_ratio/max": 1.084092954794566, "sampling/importance_sampling_ratio/mean": 0.2330683469772339, "sampling/importance_sampling_ratio/min": 5.7477650140450955e-05, "sampling/sampling_logp_difference/max": 2.676363229751587, "sampling/sampling_logp_difference/mean": 0.004258855323617657, "step": 1650, "step_time": 10.357042885199188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2010.5, "completions/mean_length": 2009.1875, "completions/mean_terminated_length": 761.0785827636719, "completions/min_length": 337.5, "completions/min_terminated_length": 337.5, "entropy": 0.01976499566808343, "epoch": 0.19951923076923078, "frac_reward_zero_std": 0.0, "grad_norm": 0.005487876478582621, "learning_rate": 8.006009615384615e-07, "loss": 0.0019, "num_tokens": 40341355.0, "reward": 0.5804395377635956, "reward_std": 0.26441681385040283, "rewards/reward_fn/mean": 0.5804395377635956, "rewards/reward_fn/std": 0.26441680639982224, "sampling/importance_sampling_ratio/max": 1.0031552016735077, "sampling/importance_sampling_ratio/mean": 0.17337845638394356, "sampling/importance_sampling_ratio/min": 7.095629916875623e-05, "sampling/sampling_logp_difference/max": 2.5027137398719788, "sampling/sampling_logp_difference/mean": 0.004227422294206917, "step": 1660, "step_time": 7.157767170015722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2033.6666666666667, "completions/mean_length": 1603.3541666666667, "completions/mean_terminated_length": 503.2454427083333, "completions/min_length": 157.33333333333334, "completions/min_terminated_length": 157.33333333333334, "entropy": 0.02247651815414429, "epoch": 0.20072115384615385, "frac_reward_zero_std": 0.0, "grad_norm": 0.0017852706369012594, "learning_rate": 7.993990384615384e-07, "loss": -0.0022, "num_tokens": 40600493.0, "reward": 0.635827879110972, "reward_std": 0.2819975366195043, "rewards/reward_fn/mean": 0.635827879110972, "rewards/reward_fn/std": 0.2819975366195043, "sampling/importance_sampling_ratio/max": 1.15144948164622, "sampling/importance_sampling_ratio/mean": 0.2568851088484128, "sampling/importance_sampling_ratio/min": 6.632745529107827e-06, "sampling/sampling_logp_difference/max": 2.6611141363779702, "sampling/sampling_logp_difference/mean": 0.00513139832764864, "step": 1670, "step_time": 10.345567165408283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 3000.0, "completions/max_terminated_length": 928.5, "completions/mean_length": 1762.921875, "completions/mean_terminated_length": 397.75999450683594, "completions/min_length": 124.5, "completions/min_terminated_length": 124.5, "entropy": 0.02012787330895662, "epoch": 0.20192307692307693, "frac_reward_zero_std": 0.125, "grad_norm": 0.001319584553129971, "learning_rate": 7.981971153846153e-07, "loss": -0.0013, "num_tokens": 40777424.0, "reward": 0.4968307912349701, "reward_std": 0.32482050359249115, "rewards/reward_fn/mean": 0.4968307912349701, "rewards/reward_fn/std": 0.32482050359249115, "sampling/importance_sampling_ratio/max": 1.113398551940918, "sampling/importance_sampling_ratio/mean": 0.27985796704888344, "sampling/importance_sampling_ratio/min": 0.00017872573494059907, "sampling/sampling_logp_difference/max": 5.312577247619629, "sampling/sampling_logp_difference/mean": 0.003959161113016307, "step": 1680, "step_time": 7.103970221802593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1474.3333333333333, "completions/mean_length": 1525.9166666666667, "completions/mean_terminated_length": 396.3780517578125, "completions/min_length": 115.66666666666667, "completions/min_terminated_length": 115.66666666666667, "entropy": 0.02188712954521179, "epoch": 0.203125, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.005401904229074717, "learning_rate": 7.969951923076923e-07, "loss": -0.003, "num_tokens": 41023776.0, "reward": 0.6589902440706888, "reward_std": 0.26409218708674115, "rewards/reward_fn/mean": 0.6589902440706888, "rewards/reward_fn/std": 0.26409218708674115, "sampling/importance_sampling_ratio/max": 1.172942539056142, "sampling/importance_sampling_ratio/mean": 0.29869075616200763, "sampling/importance_sampling_ratio/min": 2.719990646937731e-05, "sampling/sampling_logp_difference/max": 3.0894165436426797, "sampling/sampling_logp_difference/mean": 0.004355310928076506, "step": 1690, "step_time": 10.316285741236062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2024.5, "completions/mean_length": 1595.671875, "completions/mean_terminated_length": 658.0994567871094, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.0217608829960227, "epoch": 0.20432692307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.005172002594918013, "learning_rate": 7.957932692307692e-07, "loss": 0.0, "num_tokens": 41207547.0, "reward": 0.658524364233017, "reward_std": 0.2797420620918274, "rewards/reward_fn/mean": 0.658524364233017, "rewards/reward_fn/std": 0.2797420769929886, "sampling/importance_sampling_ratio/max": 0.8159090876579285, "sampling/importance_sampling_ratio/mean": 0.260173037648201, "sampling/importance_sampling_ratio/min": 8.409318525082199e-06, "sampling/sampling_logp_difference/max": 5.436912178993225, "sampling/sampling_logp_difference/mean": 0.004256355110555887, "step": 1700, "step_time": 7.150232383422553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2599.3333333333335, "completions/mean_length": 1457.9479166666667, "completions/mean_terminated_length": 538.2091267903646, "completions/min_length": 145.66666666666666, "completions/min_terminated_length": 145.66666666666666, "entropy": 0.016860764659941196, "epoch": 0.20552884615384615, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.0024633980356156826, "learning_rate": 7.945913461538462e-07, "loss": 0.0126, "num_tokens": 41448870.0, "reward": 0.6271339456240336, "reward_std": 0.302998165289561, "rewards/reward_fn/mean": 0.6271339456240336, "rewards/reward_fn/std": 0.3029981702566147, "sampling/importance_sampling_ratio/max": 1.0717326005299885, "sampling/importance_sampling_ratio/mean": 0.37634633978207904, "sampling/importance_sampling_ratio/min": 0.00010963601380353794, "sampling/sampling_logp_difference/max": 1.7096278667449951, "sampling/sampling_logp_difference/mean": 0.003415194728101293, "step": 1710, "step_time": 10.39299349244684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1842.0, "completions/mean_length": 1295.484375, "completions/mean_terminated_length": 464.04005432128906, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.023536110669374465, "epoch": 0.20673076923076922, "frac_reward_zero_std": 0.0, "grad_norm": 0.015478919260203838, "learning_rate": 7.93389423076923e-07, "loss": -0.0025, "num_tokens": 41598717.0, "reward": 0.6219582259654999, "reward_std": 0.30681225657463074, "rewards/reward_fn/mean": 0.6219582259654999, "rewards/reward_fn/std": 0.30681227147579193, "sampling/importance_sampling_ratio/max": 1.2923569679260254, "sampling/importance_sampling_ratio/mean": 0.3451686203479767, "sampling/importance_sampling_ratio/min": 1.3078911251795944e-05, "sampling/sampling_logp_difference/max": 2.849321722984314, "sampling/sampling_logp_difference/mean": 0.005497604142874479, "step": 1720, "step_time": 6.983422925136983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4583333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2164.3333333333335, "completions/mean_length": 1763.3958333333333, "completions/mean_terminated_length": 735.2293904622396, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.02065535429865122, "epoch": 0.20793269230769232, "frac_reward_zero_std": 0.0, "grad_norm": 0.006271590944379568, "learning_rate": 7.921875e-07, "loss": 0.002, "num_tokens": 41878843.0, "reward": 0.650432805220286, "reward_std": 0.2705332239468892, "rewards/reward_fn/mean": 0.650432805220286, "rewards/reward_fn/std": 0.2705332239468892, "sampling/importance_sampling_ratio/max": 1.2378543217976887, "sampling/importance_sampling_ratio/mean": 0.21657354633013406, "sampling/importance_sampling_ratio/min": 0.00023024361068261592, "sampling/sampling_logp_difference/max": 3.141491969426473, "sampling/sampling_logp_difference/mean": 0.004575273798157771, "step": 1730, "step_time": 10.533759900182485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1252.5, "completions/mean_length": 1144.15625, "completions/mean_terminated_length": 387.7418975830078, "completions/min_length": 99.5, "completions/min_terminated_length": 99.5, "entropy": 0.020292087644338607, "epoch": 0.2091346153846154, "frac_reward_zero_std": 0.125, "grad_norm": 0.005172231700271368, "learning_rate": 7.909855769230769e-07, "loss": 0.0038, "num_tokens": 42025637.0, "reward": 0.6568903625011444, "reward_std": 0.344398632645607, "rewards/reward_fn/mean": 0.6568903625011444, "rewards/reward_fn/std": 0.3443986475467682, "sampling/importance_sampling_ratio/max": 1.8669258952140808, "sampling/importance_sampling_ratio/mean": 0.508664533495903, "sampling/importance_sampling_ratio/min": 0.00016573482207604684, "sampling/sampling_logp_difference/max": 1.811318814754486, "sampling/sampling_logp_difference/mean": 0.004262521164491773, "step": 1740, "step_time": 7.001714694593102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 3000.0, "completions/max_terminated_length": 2510.6666666666665, "completions/mean_length": 1873.53125, "completions/mean_terminated_length": 744.6759440104166, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.02177187167108059, "epoch": 0.21033653846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.0017939542885869741, "learning_rate": 7.897836538461539e-07, "loss": 0.0097, "num_tokens": 42324008.0, "reward": 0.5963997642199198, "reward_std": 0.2981703579425812, "rewards/reward_fn/mean": 0.5963997642199198, "rewards/reward_fn/std": 0.2981703480084737, "sampling/importance_sampling_ratio/max": 1.091670572757721, "sampling/importance_sampling_ratio/mean": 0.23759767909844717, "sampling/importance_sampling_ratio/min": 5.251628967547125e-05, "sampling/sampling_logp_difference/max": 4.614471832911174, "sampling/sampling_logp_difference/mean": 0.004427086561918259, "step": 1750, "step_time": 10.595327524747699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1867.5, "completions/mean_length": 1526.265625, "completions/mean_terminated_length": 723.2500457763672, "completions/min_length": 178.5, "completions/min_terminated_length": 178.5, "entropy": 0.022182421386241914, "epoch": 0.21153846153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.005623992532491684, "learning_rate": 7.885817307692307e-07, "loss": 0.0, "num_tokens": 42493633.0, "reward": 0.7064787149429321, "reward_std": 0.2480490803718567, "rewards/reward_fn/mean": 0.7064787149429321, "rewards/reward_fn/std": 0.2480490580201149, "sampling/importance_sampling_ratio/max": 0.8731032013893127, "sampling/importance_sampling_ratio/mean": 0.2175261750817299, "sampling/importance_sampling_ratio/min": 0.0012434253633273329, "sampling/sampling_logp_difference/max": 3.2755895853042603, "sampling/sampling_logp_difference/mean": 0.004895105725154281, "step": 1760, "step_time": 7.154489532485604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2292.0, "completions/mean_length": 1724.5208333333333, "completions/mean_terminated_length": 604.2483622233073, "completions/min_length": 137.33333333333334, "completions/min_terminated_length": 137.33333333333334, "entropy": 0.020865775365382432, "epoch": 0.2127403846153846, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.0028558997437357903, "learning_rate": 7.873798076923077e-07, "loss": -0.0031, "num_tokens": 42765075.0, "reward": 0.6836751302083334, "reward_std": 0.2558184067408244, "rewards/reward_fn/mean": 0.6836751302083334, "rewards/reward_fn/std": 0.2558184117078781, "sampling/importance_sampling_ratio/max": 1.0247146884600322, "sampling/importance_sampling_ratio/mean": 0.2504562934239705, "sampling/importance_sampling_ratio/min": 1.2245528703639745e-06, "sampling/sampling_logp_difference/max": 6.124047915140788, "sampling/sampling_logp_difference/mean": 0.004303150189419587, "step": 1770, "step_time": 10.513908832520247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 2136.140625, "completions/mean_terminated_length": 723.3944549560547, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.01980966590344906, "epoch": 0.21394230769230768, "frac_reward_zero_std": 0.0, "grad_norm": 0.001577327842824161, "learning_rate": 7.861778846153846e-07, "loss": 0.0006, "num_tokens": 42980332.0, "reward": 0.552566409111023, "reward_std": 0.24059820920228958, "rewards/reward_fn/mean": 0.552566409111023, "rewards/reward_fn/std": 0.2405981868505478, "sampling/importance_sampling_ratio/max": 1.413317620754242, "sampling/importance_sampling_ratio/mean": 0.20498643070459366, "sampling/importance_sampling_ratio/min": 3.626771513154381e-05, "sampling/sampling_logp_difference/max": 2.391763210296631, "sampling/sampling_logp_difference/mean": 0.004364197375252843, "step": 1780, "step_time": 7.216023504547775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4166666666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1666.0, "completions/mean_length": 1623.9479166666667, "completions/mean_terminated_length": 632.3645222981771, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.021948422025889158, "epoch": 0.21514423076923078, "frac_reward_zero_std": 0.0, "grad_norm": 0.0016523796366527677, "learning_rate": 7.849759615384614e-07, "loss": 0.0162, "num_tokens": 43227527.0, "reward": 0.6010072628657023, "reward_std": 0.31461278597513836, "rewards/reward_fn/mean": 0.6010072628657023, "rewards/reward_fn/std": 0.31461280584335327, "sampling/importance_sampling_ratio/max": 1.1194902459780376, "sampling/importance_sampling_ratio/mean": 0.24881253143151602, "sampling/importance_sampling_ratio/min": 8.282244289148366e-05, "sampling/sampling_logp_difference/max": 3.6579667727152505, "sampling/sampling_logp_difference/mean": 0.004951321830352147, "step": 1790, "step_time": 10.276007424853741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2068.0, "completions/mean_length": 1353.140625, "completions/mean_terminated_length": 546.0554351806641, "completions/min_length": 139.5, "completions/min_terminated_length": 139.5, "entropy": 0.022743909806013107, "epoch": 0.21634615384615385, "frac_reward_zero_std": 0.125, "grad_norm": 0.004119078628718853, "learning_rate": 7.837740384615385e-07, "loss": -0.0036, "num_tokens": 43388104.0, "reward": 0.6980603039264679, "reward_std": 0.2960897535085678, "rewards/reward_fn/mean": 0.6980603039264679, "rewards/reward_fn/std": 0.2960897535085678, "sampling/importance_sampling_ratio/max": 1.1816225945949554, "sampling/importance_sampling_ratio/mean": 0.32987505197525024, "sampling/importance_sampling_ratio/min": 5.857692713107099e-06, "sampling/sampling_logp_difference/max": 4.0047526359558105, "sampling/sampling_logp_difference/mean": 0.004863352747634053, "step": 1800, "step_time": 7.108362975157798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23958333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1906.3333333333333, "completions/mean_length": 1023.0208333333334, "completions/mean_terminated_length": 397.69935099283856, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.02365876641124487, "epoch": 0.21754807692307693, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.012883143499493599, "learning_rate": 7.825721153846153e-07, "loss": -0.0093, "num_tokens": 43610546.0, "reward": 0.6543925205866495, "reward_std": 0.3222581148147583, "rewards/reward_fn/mean": 0.6543925205866495, "rewards/reward_fn/std": 0.3222580999135971, "sampling/importance_sampling_ratio/max": 1.7208409309387207, "sampling/importance_sampling_ratio/mean": 0.49251843492190045, "sampling/importance_sampling_ratio/min": 0.00013922992790564118, "sampling/sampling_logp_difference/max": 3.0631954669952393, "sampling/sampling_logp_difference/mean": 0.004888636680940787, "step": 1810, "step_time": 10.151780189294367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2762.0, "completions/mean_length": 1641.109375, "completions/mean_terminated_length": 828.7458801269531, "completions/min_length": 209.5, "completions/min_terminated_length": 209.5, "entropy": 0.015720922127366067, "epoch": 0.21875, "frac_reward_zero_std": 0.125, "grad_norm": 0.00339904916472733, "learning_rate": 7.813701923076923e-07, "loss": -0.0045, "num_tokens": 43784881.0, "reward": 0.5286347717046738, "reward_std": 0.3548456132411957, "rewards/reward_fn/mean": 0.5286347717046738, "rewards/reward_fn/std": 0.3548456132411957, "sampling/importance_sampling_ratio/max": 1.1511951684951782, "sampling/importance_sampling_ratio/mean": 0.3653012663125992, "sampling/importance_sampling_ratio/min": 4.3751616260578885e-05, "sampling/sampling_logp_difference/max": 4.276604175567627, "sampling/sampling_logp_difference/mean": 0.0034619306679815054, "step": 1820, "step_time": 7.030290784500539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3229166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2448.3333333333335, "completions/mean_length": 1440.3645833333333, "completions/mean_terminated_length": 774.9297180175781, "completions/min_length": 133.33333333333334, "completions/min_terminated_length": 133.33333333333334, "entropy": 0.022757479548454286, "epoch": 0.21995192307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.014561628922820091, "learning_rate": 7.801682692307692e-07, "loss": -0.0044, "num_tokens": 44018036.0, "reward": 0.6418046156565348, "reward_std": 0.30111566185951233, "rewards/reward_fn/mean": 0.6418046156565348, "rewards/reward_fn/std": 0.30111566185951233, "sampling/importance_sampling_ratio/max": 1.86243736743927, "sampling/importance_sampling_ratio/mean": 0.3764208257198334, "sampling/importance_sampling_ratio/min": 0.0007715753914302089, "sampling/sampling_logp_difference/max": 3.025995453198751, "sampling/sampling_logp_difference/mean": 0.004635944962501526, "step": 1830, "step_time": 10.186713667958974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2866.5, "completions/mean_length": 1752.1875, "completions/mean_terminated_length": 678.1333618164062, "completions/min_length": 145.5, "completions/min_terminated_length": 145.5, "entropy": 0.02299376130104065, "epoch": 0.22115384615384615, "frac_reward_zero_std": 0.0, "grad_norm": 0.0051248823292553425, "learning_rate": 7.789663461538461e-07, "loss": 0.0011, "num_tokens": 44213736.0, "reward": 0.6013289391994476, "reward_std": 0.29438523948192596, "rewards/reward_fn/mean": 0.6013289391994476, "rewards/reward_fn/std": 0.29438523948192596, "sampling/importance_sampling_ratio/max": 0.9077131748199463, "sampling/importance_sampling_ratio/mean": 0.22360143065452576, "sampling/importance_sampling_ratio/min": 1.1267289551142312e-05, "sampling/sampling_logp_difference/max": 2.537571966648102, "sampling/sampling_logp_difference/mean": 0.004825479816645384, "step": 1840, "step_time": 7.222395991720259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3229166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2463.0, "completions/mean_length": 1483.5729166666667, "completions/mean_terminated_length": 763.3224690755209, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.02653982415795326, "epoch": 0.22235576923076922, "frac_reward_zero_std": 0.0, "grad_norm": 0.0038309141527861357, "learning_rate": 7.77764423076923e-07, "loss": -0.0011, "num_tokens": 44459135.0, "reward": 0.7304226954778036, "reward_std": 0.25978363553682965, "rewards/reward_fn/mean": 0.7304226954778036, "rewards/reward_fn/std": 0.25978363553682965, "sampling/importance_sampling_ratio/max": 2.202442208925883, "sampling/importance_sampling_ratio/mean": 0.34264151255289715, "sampling/importance_sampling_ratio/min": 1.2180654569723023e-05, "sampling/sampling_logp_difference/max": 3.6345282395680747, "sampling/sampling_logp_difference/mean": 0.005293254119654496, "step": 1850, "step_time": 10.181809578090906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 1528.4375, "completions/mean_terminated_length": 458.1816864013672, "completions/min_length": 128.5, "completions/min_terminated_length": 128.5, "entropy": 0.020904421992599963, "epoch": 0.22355769230769232, "frac_reward_zero_std": 0.0, "grad_norm": 0.006468642968684435, "learning_rate": 7.765625e-07, "loss": 0.0334, "num_tokens": 44623171.0, "reward": 0.6843212842941284, "reward_std": 0.27079035341739655, "rewards/reward_fn/mean": 0.6843212842941284, "rewards/reward_fn/std": 0.27079035341739655, "sampling/importance_sampling_ratio/max": 1.2015548348426819, "sampling/importance_sampling_ratio/mean": 0.3149920031428337, "sampling/importance_sampling_ratio/min": 2.1180923113206518e-05, "sampling/sampling_logp_difference/max": 2.960035562515259, "sampling/sampling_logp_difference/mean": 0.004117492353543639, "step": 1860, "step_time": 6.975805634912104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2708333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2078.0, "completions/mean_length": 1227.0833333333333, "completions/mean_terminated_length": 581.9448649088541, "completions/min_length": 135.33333333333334, "completions/min_terminated_length": 135.33333333333334, "entropy": 0.02192576229572296, "epoch": 0.2247596153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.00693828659132123, "learning_rate": 7.753605769230769e-07, "loss": 0.0304, "num_tokens": 44826819.0, "reward": 0.7271009087562561, "reward_std": 0.24405794342358908, "rewards/reward_fn/mean": 0.7271009087562561, "rewards/reward_fn/std": 0.244057963291804, "sampling/importance_sampling_ratio/max": 1.5408448775609334, "sampling/importance_sampling_ratio/mean": 0.3741447329521179, "sampling/importance_sampling_ratio/min": 0.0001681436418342249, "sampling/sampling_logp_difference/max": 1.6228188673655193, "sampling/sampling_logp_difference/mean": 0.0044375955282400055, "step": 1870, "step_time": 10.111370871588587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2547.0, "completions/mean_length": 2008.96875, "completions/mean_terminated_length": 774.5625, "completions/min_length": 107.5, "completions/min_terminated_length": 107.5, "entropy": 0.020655788108706476, "epoch": 0.22596153846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.003865031059831381, "learning_rate": 7.741586538461539e-07, "loss": -0.0045, "num_tokens": 45032745.0, "reward": 0.5834526121616364, "reward_std": 0.2787751778960228, "rewards/reward_fn/mean": 0.5834526121616364, "rewards/reward_fn/std": 0.2787751629948616, "sampling/importance_sampling_ratio/max": 1.4599666595458984, "sampling/importance_sampling_ratio/mean": 0.21127133816480637, "sampling/importance_sampling_ratio/min": 9.282902738050325e-06, "sampling/sampling_logp_difference/max": 2.7449283599853516, "sampling/sampling_logp_difference/mean": 0.004173149121925235, "step": 1880, "step_time": 7.231649817619473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3958333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1835.3333333333333, "completions/mean_length": 1544.2604166666667, "completions/mean_terminated_length": 582.2592569986979, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.02232257425785065, "epoch": 0.22716346153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.0038119701202958822, "learning_rate": 7.729567307692308e-07, "loss": -0.0041, "num_tokens": 45278778.0, "reward": 0.6578930815060934, "reward_std": 0.29029299815495807, "rewards/reward_fn/mean": 0.6578930815060934, "rewards/reward_fn/std": 0.29029300808906555, "sampling/importance_sampling_ratio/max": 1.3111144701639812, "sampling/importance_sampling_ratio/mean": 0.29375553131103516, "sampling/importance_sampling_ratio/min": 0.0002899061843966895, "sampling/sampling_logp_difference/max": 4.729866981506348, "sampling/sampling_logp_difference/mean": 0.004694852357109387, "step": 1890, "step_time": 10.423506897035987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2144.5, "completions/mean_length": 1398.53125, "completions/mean_terminated_length": 496.69444274902344, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.026418645679950715, "epoch": 0.2283653846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.00938359834253788, "learning_rate": 7.717548076923076e-07, "loss": -0.0047, "num_tokens": 45435348.0, "reward": 0.6881656050682068, "reward_std": 0.24387121200561523, "rewards/reward_fn/mean": 0.6881656050682068, "rewards/reward_fn/std": 0.24387121945619583, "sampling/importance_sampling_ratio/max": 1.0731061697006226, "sampling/importance_sampling_ratio/mean": 0.334454208612442, "sampling/importance_sampling_ratio/min": 0.0001019465048557322, "sampling/sampling_logp_difference/max": 1.8751838207244873, "sampling/sampling_logp_difference/mean": 0.0058221458457410336, "step": 1900, "step_time": 7.14668694883585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4270833333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1622.0, "completions/mean_length": 1505.9375, "completions/mean_terminated_length": 404.6011555989583, "completions/min_length": 129.66666666666666, "completions/min_terminated_length": 129.66666666666666, "entropy": 0.021599502582103015, "epoch": 0.22956730769230768, "frac_reward_zero_std": 0.0, "grad_norm": 0.004030511248856783, "learning_rate": 7.705528846153846e-07, "loss": -0.0041, "num_tokens": 45699558.0, "reward": 0.6726020773251852, "reward_std": 0.2530125677585602, "rewards/reward_fn/mean": 0.6726020773251852, "rewards/reward_fn/std": 0.25301256279150647, "sampling/importance_sampling_ratio/max": 1.1876376271247864, "sampling/importance_sampling_ratio/mean": 0.33415038386980694, "sampling/importance_sampling_ratio/min": 0.00013082211141105896, "sampling/sampling_logp_difference/max": 2.7499047120412192, "sampling/sampling_logp_difference/mean": 0.004574586792538564, "step": 1910, "step_time": 10.368375700339675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2558.5, "completions/mean_length": 1510.21875, "completions/mean_terminated_length": 616.3500061035156, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.02553655058145523, "epoch": 0.23076923076923078, "frac_reward_zero_std": 0.0, "grad_norm": 0.010807781480252743, "learning_rate": 7.693509615384615e-07, "loss": -0.0063, "num_tokens": 45877948.0, "reward": 0.6862945854663849, "reward_std": 0.22726966440677643, "rewards/reward_fn/mean": 0.6862945854663849, "rewards/reward_fn/std": 0.22726966440677643, "sampling/importance_sampling_ratio/max": 1.1985666155815125, "sampling/importance_sampling_ratio/mean": 0.2757994681596756, "sampling/importance_sampling_ratio/min": 3.362066195222724e-05, "sampling/sampling_logp_difference/max": 2.154352605342865, "sampling/sampling_logp_difference/mean": 0.005315361777320504, "step": 1920, "step_time": 7.033246487472207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1578.3333333333333, "completions/mean_length": 1509.4375, "completions/mean_terminated_length": 378.54412841796875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.019934075698256493, "epoch": 0.23197115384615385, "frac_reward_zero_std": 0.0, "grad_norm": 0.005220526363700628, "learning_rate": 7.681490384615384e-07, "loss": -0.0051, "num_tokens": 46149694.0, "reward": 0.6705043911933899, "reward_std": 0.25508413712183636, "rewards/reward_fn/mean": 0.6705043911933899, "rewards/reward_fn/std": 0.2550841321547826, "sampling/importance_sampling_ratio/max": 1.9342271089553833, "sampling/importance_sampling_ratio/mean": 0.3827037264903386, "sampling/importance_sampling_ratio/min": 8.372569883855856e-05, "sampling/sampling_logp_difference/max": 3.5371061166127524, "sampling/sampling_logp_difference/mean": 0.004418620063612859, "step": 1930, "step_time": 10.416049070004373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1792.5, "completions/mean_length": 2032.015625, "completions/mean_terminated_length": 930.3618469238281, "completions/min_length": 368.5, "completions/min_terminated_length": 368.5, "entropy": 0.020052506867796183, "epoch": 0.23317307692307693, "frac_reward_zero_std": 0.0, "grad_norm": 0.000936623546294868, "learning_rate": 7.669471153846154e-07, "loss": 0.0089, "num_tokens": 46345967.0, "reward": 0.5612480044364929, "reward_std": 0.27246256172657013, "rewards/reward_fn/mean": 0.5612480044364929, "rewards/reward_fn/std": 0.2724625766277313, "sampling/importance_sampling_ratio/max": 1.0094264447689056, "sampling/importance_sampling_ratio/mean": 0.20078370720148087, "sampling/importance_sampling_ratio/min": 1.4886305962136248e-05, "sampling/sampling_logp_difference/max": 4.013256549835205, "sampling/sampling_logp_difference/mean": 0.004424422164447606, "step": 1940, "step_time": 7.1097986754961315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4270833333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2627.3333333333335, "completions/mean_length": 1760.90625, "completions/mean_terminated_length": 805.9557291666666, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.023102945275604726, "epoch": 0.234375, "frac_reward_zero_std": 0.0, "grad_norm": 0.0011826003901660442, "learning_rate": 7.657451923076923e-07, "loss": -0.0145, "num_tokens": 46629894.0, "reward": 0.6165264050165812, "reward_std": 0.2973473270734151, "rewards/reward_fn/mean": 0.6165264050165812, "rewards/reward_fn/std": 0.2973473072052002, "sampling/importance_sampling_ratio/max": 1.3372368812561035, "sampling/importance_sampling_ratio/mean": 0.25043170154094696, "sampling/importance_sampling_ratio/min": 6.087007648147846e-06, "sampling/sampling_logp_difference/max": 3.861280600229899, "sampling/sampling_logp_difference/mean": 0.004855277327199777, "step": 1950, "step_time": 10.526018810551614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2167.0, "completions/mean_length": 1449.96875, "completions/mean_terminated_length": 617.1078643798828, "completions/min_length": 111.5, "completions/min_terminated_length": 111.5, "entropy": 0.0231132872402668, "epoch": 0.23557692307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.0070088389329612255, "learning_rate": 7.645432692307691e-07, "loss": -0.0045, "num_tokens": 46791524.0, "reward": 0.7192057073116302, "reward_std": 0.24080779403448105, "rewards/reward_fn/mean": 0.7192057073116302, "rewards/reward_fn/std": 0.24080780148506165, "sampling/importance_sampling_ratio/max": 1.4720958471298218, "sampling/importance_sampling_ratio/mean": 0.3531622514128685, "sampling/importance_sampling_ratio/min": 8.67492453835439e-05, "sampling/sampling_logp_difference/max": 1.2765935063362122, "sampling/sampling_logp_difference/mean": 0.004733153851702809, "step": 1960, "step_time": 7.150758927036077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1795.0, "completions/mean_length": 1415.75, "completions/mean_terminated_length": 337.46434529622394, "completions/min_length": 105.66666666666667, "completions/min_terminated_length": 105.66666666666667, "entropy": 0.021232820488512515, "epoch": 0.23677884615384615, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.00876674521714449, "learning_rate": 7.633413461538462e-07, "loss": -0.0056, "num_tokens": 47048004.0, "reward": 0.633433997631073, "reward_std": 0.3031046489874522, "rewards/reward_fn/mean": 0.633433997631073, "rewards/reward_fn/std": 0.3031046390533447, "sampling/importance_sampling_ratio/max": 1.377392093340556, "sampling/importance_sampling_ratio/mean": 0.4236486256122589, "sampling/importance_sampling_ratio/min": 2.6475454736403965e-05, "sampling/sampling_logp_difference/max": 2.7819604873657227, "sampling/sampling_logp_difference/mean": 0.004486140018949906, "step": 1970, "step_time": 10.341473854705692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1843.0, "completions/mean_length": 1597.421875, "completions/mean_terminated_length": 572.4020385742188, "completions/min_length": 197.5, "completions/min_terminated_length": 197.5, "entropy": 0.022275352478027345, "epoch": 0.23798076923076922, "frac_reward_zero_std": 0.0, "grad_norm": 0.006115023512393236, "learning_rate": 7.62139423076923e-07, "loss": -0.0031, "num_tokens": 47214839.0, "reward": 0.6470248699188232, "reward_std": 0.2871789336204529, "rewards/reward_fn/mean": 0.6470248699188232, "rewards/reward_fn/std": 0.2871789336204529, "sampling/importance_sampling_ratio/max": 1.3578852415084839, "sampling/importance_sampling_ratio/mean": 0.1962711215019226, "sampling/importance_sampling_ratio/min": 2.4128640461640316e-05, "sampling/sampling_logp_difference/max": 4.756601810455322, "sampling/sampling_logp_difference/mean": 0.005161994136869907, "step": 1980, "step_time": 7.120637003518641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1417.0, "completions/mean_length": 1271.96875, "completions/mean_terminated_length": 375.1177571614583, "completions/min_length": 109.33333333333333, "completions/min_terminated_length": 109.33333333333333, "entropy": 0.0188619339838624, "epoch": 0.23918269230769232, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.00801182258874178, "learning_rate": 7.609375e-07, "loss": -0.0047, "num_tokens": 47437068.0, "reward": 0.643988569577535, "reward_std": 0.29752257963021594, "rewards/reward_fn/mean": 0.643988569577535, "rewards/reward_fn/std": 0.29752254486083984, "sampling/importance_sampling_ratio/max": 1.345626990000407, "sampling/importance_sampling_ratio/mean": 0.4073835213979085, "sampling/importance_sampling_ratio/min": 0.00019277433481571885, "sampling/sampling_logp_difference/max": 2.474699378013611, "sampling/sampling_logp_difference/mean": 0.0039973332701871795, "step": 1990, "step_time": 10.2203319539316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 3000.0, "completions/max_terminated_length": 859.5, "completions/mean_length": 1248.8125, "completions/mean_terminated_length": 397.91229248046875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.02333575636148453, "epoch": 0.2403846153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.010262195952236652, "learning_rate": 7.597355769230769e-07, "loss": -0.0064, "num_tokens": 47594464.0, "reward": 0.73726487159729, "reward_std": 0.2483183890581131, "rewards/reward_fn/mean": 0.73726487159729, "rewards/reward_fn/std": 0.2483183816075325, "sampling/importance_sampling_ratio/max": 1.0709866881370544, "sampling/importance_sampling_ratio/mean": 0.3480774015188217, "sampling/importance_sampling_ratio/min": 0.00014751507535493147, "sampling/sampling_logp_difference/max": 3.515850007534027, "sampling/sampling_logp_difference/mean": 0.005036287708207965, "step": 2000, "step_time": 7.062471184786409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3541666666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2238.0, "completions/mean_length": 1477.5625, "completions/mean_terminated_length": 691.9385172526041, "completions/min_length": 128.66666666666666, "completions/min_terminated_length": 128.66666666666666, "entropy": 0.02247656639665365, "epoch": 0.24158653846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.004639199934899807, "learning_rate": 7.585336538461539e-07, "loss": -0.0038, "num_tokens": 47832902.0, "reward": 0.6375044385592142, "reward_std": 0.2961011826992035, "rewards/reward_fn/mean": 0.6375044385592142, "rewards/reward_fn/std": 0.296101172765096, "sampling/importance_sampling_ratio/max": 0.8318113684654236, "sampling/importance_sampling_ratio/mean": 0.23514928917090097, "sampling/importance_sampling_ratio/min": 0.00017589957618232196, "sampling/sampling_logp_difference/max": 3.571462551752726, "sampling/sampling_logp_difference/mean": 0.004521665318558614, "step": 2010, "step_time": 10.385367651470006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2575.5, "completions/mean_length": 1870.828125, "completions/mean_terminated_length": 688.8547058105469, "completions/min_length": 139.5, "completions/min_terminated_length": 139.5, "entropy": 0.02058091126382351, "epoch": 0.24278846153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.0027600282337516546, "learning_rate": 7.573317307692307e-07, "loss": -0.0008, "num_tokens": 48033875.0, "reward": 0.5808525085449219, "reward_std": 0.2842250466346741, "rewards/reward_fn/mean": 0.5808525085449219, "rewards/reward_fn/std": 0.28422506153583527, "sampling/importance_sampling_ratio/max": 1.4440171718597412, "sampling/importance_sampling_ratio/mean": 0.24254721403121948, "sampling/importance_sampling_ratio/min": 3.1565215067530517e-05, "sampling/sampling_logp_difference/max": 2.539847731590271, "sampling/sampling_logp_difference/mean": 0.0042612741235643625, "step": 2020, "step_time": 7.22410007212311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3229166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2683.0, "completions/mean_length": 1410.78125, "completions/mean_terminated_length": 661.4827270507812, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.024636955931782722, "epoch": 0.2439903846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.023745182901620865, "learning_rate": 7.561298076923076e-07, "loss": 0.0005, "num_tokens": 48268774.0, "reward": 0.6471522847811381, "reward_std": 0.30149303873380023, "rewards/reward_fn/mean": 0.6471522847811381, "rewards/reward_fn/std": 0.3014930188655853, "sampling/importance_sampling_ratio/max": 1.0984151760737102, "sampling/importance_sampling_ratio/mean": 0.27774567405382794, "sampling/importance_sampling_ratio/min": 5.794481359918299e-07, "sampling/sampling_logp_difference/max": 8.011621157328287, "sampling/sampling_logp_difference/mean": 0.005071000506480535, "step": 2030, "step_time": 10.29387786919251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1681.5, "completions/mean_length": 1392.84375, "completions/mean_terminated_length": 496.5410614013672, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.02370260711759329, "epoch": 0.24519230769230768, "frac_reward_zero_std": 0.125, "grad_norm": 0.00967030692845583, "learning_rate": 7.549278846153846e-07, "loss": -0.0052, "num_tokens": 48437292.0, "reward": 0.7128006815910339, "reward_std": 0.2641230821609497, "rewards/reward_fn/mean": 0.7128006815910339, "rewards/reward_fn/std": 0.2641230970621109, "sampling/importance_sampling_ratio/max": 1.8287623524665833, "sampling/importance_sampling_ratio/mean": 0.3044873923063278, "sampling/importance_sampling_ratio/min": 0.0008199424142958378, "sampling/sampling_logp_difference/max": 2.0772258043289185, "sampling/sampling_logp_difference/mean": 0.005147324409335852, "step": 2040, "step_time": 7.284801721852273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4583333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1786.3333333333333, "completions/mean_length": 1760.8125, "completions/mean_terminated_length": 726.7777913411459, "completions/min_length": 199.33333333333334, "completions/min_terminated_length": 199.33333333333334, "entropy": 0.02280855979770422, "epoch": 0.24639423076923078, "frac_reward_zero_std": 0.0, "grad_norm": 0.009827903471887112, "learning_rate": 7.537259615384614e-07, "loss": -0.0061, "num_tokens": 48712194.0, "reward": 0.6325724919637045, "reward_std": 0.26717649896939594, "rewards/reward_fn/mean": 0.6325724919637045, "rewards/reward_fn/std": 0.26717649896939594, "sampling/importance_sampling_ratio/max": 1.0688097675641377, "sampling/importance_sampling_ratio/mean": 0.2257056087255478, "sampling/importance_sampling_ratio/min": 0.00012615618470590562, "sampling/sampling_logp_difference/max": 2.908036231994629, "sampling/sampling_logp_difference/mean": 0.004821818011502425, "step": 2050, "step_time": 10.2231416660361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2236.5, "completions/mean_length": 1485.390625, "completions/mean_terminated_length": 501.83335876464844, "completions/min_length": 103.5, "completions/min_terminated_length": 103.5, "entropy": 0.020781151950359344, "epoch": 0.24759615384615385, "frac_reward_zero_std": 0.125, "grad_norm": 0.003895982401445508, "learning_rate": 7.525240384615385e-07, "loss": -0.0004, "num_tokens": 48881451.0, "reward": 0.5067539811134338, "reward_std": 0.36260609328746796, "rewards/reward_fn/mean": 0.5067539811134338, "rewards/reward_fn/std": 0.36260607838630676, "sampling/importance_sampling_ratio/max": 1.034386157989502, "sampling/importance_sampling_ratio/mean": 0.2946948856115341, "sampling/importance_sampling_ratio/min": 7.464032591997238e-08, "sampling/sampling_logp_difference/max": 2.680044412612915, "sampling/sampling_logp_difference/mean": 0.0048369241412729025, "step": 2060, "step_time": 7.142410495318472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2493.0, "completions/mean_length": 1574.7083333333333, "completions/mean_terminated_length": 760.2555745442709, "completions/min_length": 162.66666666666666, "completions/min_terminated_length": 162.66666666666666, "entropy": 0.02263414692133665, "epoch": 0.24879807692307693, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.0030857212841510773, "learning_rate": 7.513221153846153e-07, "loss": 0.0011, "num_tokens": 49128799.0, "reward": 0.5817336638768514, "reward_std": 0.31289053956667584, "rewards/reward_fn/mean": 0.5817336638768514, "rewards/reward_fn/std": 0.31289054950078327, "sampling/importance_sampling_ratio/max": 0.9228005309899648, "sampling/importance_sampling_ratio/mean": 0.24728034436702728, "sampling/importance_sampling_ratio/min": 2.5066285767631296e-05, "sampling/sampling_logp_difference/max": 1.901768962542216, "sampling/sampling_logp_difference/mean": 0.004711613990366459, "step": 2070, "step_time": 10.343278066441417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2704.0, "completions/mean_length": 1378.234375, "completions/mean_terminated_length": 694.08203125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.02528456784784794, "epoch": 0.25, "frac_reward_zero_std": 0.0, "grad_norm": 0.003791192313656211, "learning_rate": 7.501201923076923e-07, "loss": -0.0032, "num_tokens": 49294542.0, "reward": 0.7262647151947021, "reward_std": 0.2538074404001236, "rewards/reward_fn/mean": 0.7262647151947021, "rewards/reward_fn/std": 0.2538074254989624, "sampling/importance_sampling_ratio/max": 1.4217981100082397, "sampling/importance_sampling_ratio/mean": 0.270192414522171, "sampling/importance_sampling_ratio/min": 0.00043615818503894843, "sampling/sampling_logp_difference/max": 1.4587246179580688, "sampling/sampling_logp_difference/mean": 0.005426641087979078, "step": 2080, "step_time": 7.11383582688868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3229166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1751.3333333333333, "completions/mean_length": 1291.2083333333333, "completions/mean_terminated_length": 491.43016560872394, "completions/min_length": 134.66666666666666, "completions/min_terminated_length": 134.66666666666666, "entropy": 0.0253146318718791, "epoch": 0.2512019230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.011274877935647964, "learning_rate": 7.489182692307692e-07, "loss": -0.0059, "num_tokens": 49527930.0, "reward": 0.7072197000185648, "reward_std": 0.260044405857722, "rewards/reward_fn/mean": 0.7072197000185648, "rewards/reward_fn/std": 0.2600443959236145, "sampling/importance_sampling_ratio/max": 1.7487555344899495, "sampling/importance_sampling_ratio/mean": 0.2939763168493907, "sampling/importance_sampling_ratio/min": 4.427085817345263e-05, "sampling/sampling_logp_difference/max": 3.2844557762145996, "sampling/sampling_logp_difference/mean": 0.005671370619287093, "step": 2090, "step_time": 10.383091604337096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1104.5, "completions/mean_length": 1459.828125, "completions/mean_terminated_length": 473.1431121826172, "completions/min_length": 117.5, "completions/min_terminated_length": 117.5, "entropy": 0.021641460433602332, "epoch": 0.25240384615384615, "frac_reward_zero_std": 0.0, "grad_norm": 0.002464283024892211, "learning_rate": 7.477163461538461e-07, "loss": 0.0027, "num_tokens": 49685303.0, "reward": 0.6244517862796783, "reward_std": 0.2918383777141571, "rewards/reward_fn/mean": 0.6244517862796783, "rewards/reward_fn/std": 0.2918383777141571, "sampling/importance_sampling_ratio/max": 1.485206961631775, "sampling/importance_sampling_ratio/mean": 0.40390336513519287, "sampling/importance_sampling_ratio/min": 9.115009830651388e-05, "sampling/sampling_logp_difference/max": 3.967568516731262, "sampling/sampling_logp_difference/mean": 0.003965326119214296, "step": 2100, "step_time": 6.904691421799361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2036.3333333333333, "completions/mean_length": 1423.8541666666667, "completions/mean_terminated_length": 609.6734110514323, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.023536230064928533, "epoch": 0.2536057692307692, "frac_reward_zero_std": 0.0, "grad_norm": 0.005554532166570425, "learning_rate": 7.465144230769231e-07, "loss": 0.0113, "num_tokens": 49941897.0, "reward": 0.6497285763422648, "reward_std": 0.3236747036377589, "rewards/reward_fn/mean": 0.6497285763422648, "rewards/reward_fn/std": 0.3236746738354365, "sampling/importance_sampling_ratio/max": 1.3510222832361858, "sampling/importance_sampling_ratio/mean": 0.31998629371325177, "sampling/importance_sampling_ratio/min": 8.55933825126461e-05, "sampling/sampling_logp_difference/max": 1.7157063086827595, "sampling/sampling_logp_difference/mean": 0.004240198681751887, "step": 2110, "step_time": 10.507829964347184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1700.5, "completions/mean_length": 1317.625, "completions/mean_terminated_length": 449.0801086425781, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.023352447710931302, "epoch": 0.2548076923076923, "frac_reward_zero_std": 0.125, "grad_norm": 0.01005303394049406, "learning_rate": 7.453125e-07, "loss": -0.002, "num_tokens": 50084297.0, "reward": 0.5881263017654419, "reward_std": 0.320743203163147, "rewards/reward_fn/mean": 0.5881263017654419, "rewards/reward_fn/std": 0.320743203163147, "sampling/importance_sampling_ratio/max": 1.324938714504242, "sampling/importance_sampling_ratio/mean": 0.35413944721221924, "sampling/importance_sampling_ratio/min": 5.2127608881846754e-05, "sampling/sampling_logp_difference/max": 1.8082903623580933, "sampling/sampling_logp_difference/mean": 0.004779822193086147, "step": 2120, "step_time": 7.212463407218456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2665.3333333333335, "completions/mean_length": 1407.9479166666667, "completions/mean_terminated_length": 677.8157348632812, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.02445549741387367, "epoch": 0.25600961538461536, "frac_reward_zero_std": 0.0, "grad_norm": 0.009541948325932026, "learning_rate": 7.441105769230768e-07, "loss": 0.0071, "num_tokens": 50330036.0, "reward": 0.7034512559572855, "reward_std": 0.27187936504681903, "rewards/reward_fn/mean": 0.7034512559572855, "rewards/reward_fn/std": 0.2718793749809265, "sampling/importance_sampling_ratio/max": 1.3622690439224243, "sampling/importance_sampling_ratio/mean": 0.27967455983161926, "sampling/importance_sampling_ratio/min": 6.834472211873314e-05, "sampling/sampling_logp_difference/max": 2.2079272667566934, "sampling/sampling_logp_difference/mean": 0.005140390091886123, "step": 2130, "step_time": 10.426290123537182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 1698.0, "completions/mean_length": 1277.28125, "completions/mean_terminated_length": 739.8461608886719, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.02543333452194929, "epoch": 0.25721153846153844, "frac_reward_zero_std": 0.0, "grad_norm": 0.021380888298153877, "learning_rate": 7.429086538461539e-07, "loss": 0.0058, "num_tokens": 50482598.0, "reward": 0.7813950777053833, "reward_std": 0.2187374159693718, "rewards/reward_fn/mean": 0.7813950777053833, "rewards/reward_fn/std": 0.2187374085187912, "sampling/importance_sampling_ratio/max": 1.0091213881969452, "sampling/importance_sampling_ratio/mean": 0.28896428644657135, "sampling/importance_sampling_ratio/min": 3.206442670489196e-05, "sampling/sampling_logp_difference/max": 1.7485098242759705, "sampling/sampling_logp_difference/mean": 0.005064171506091952, "step": 2140, "step_time": 7.203683662507683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3958333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1654.3333333333333, "completions/mean_length": 1588.3020833333333, "completions/mean_terminated_length": 664.9816691080729, "completions/min_length": 169.66666666666666, "completions/min_terminated_length": 169.66666666666666, "entropy": 0.022798682376742364, "epoch": 0.25841346153846156, "frac_reward_zero_std": 0.0, "grad_norm": 0.0032890283036977053, "learning_rate": 7.417067307692307e-07, "loss": 0.0026, "num_tokens": 50740827.0, "reward": 0.6631722450256348, "reward_std": 0.26691244542598724, "rewards/reward_fn/mean": 0.6631722450256348, "rewards/reward_fn/std": 0.26691245039304096, "sampling/importance_sampling_ratio/max": 1.2380284468332927, "sampling/importance_sampling_ratio/mean": 0.2238804300626119, "sampling/importance_sampling_ratio/min": 0.00015998150335387132, "sampling/sampling_logp_difference/max": 2.4481306076049805, "sampling/sampling_logp_difference/mean": 0.004879165596018235, "step": 2150, "step_time": 10.439438015129417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 1271.953125, "completions/mean_terminated_length": 547.0400085449219, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.023539925925433636, "epoch": 0.25961538461538464, "frac_reward_zero_std": 0.0, "grad_norm": 0.022054389119148254, "learning_rate": 7.405048076923076e-07, "loss": -0.0002, "num_tokens": 50895768.0, "reward": 0.6970775127410889, "reward_std": 0.26039406657218933, "rewards/reward_fn/mean": 0.6970775127410889, "rewards/reward_fn/std": 0.26039406657218933, "sampling/importance_sampling_ratio/max": 1.3879244923591614, "sampling/importance_sampling_ratio/mean": 0.3132207989692688, "sampling/importance_sampling_ratio/min": 2.654558937820184e-05, "sampling/sampling_logp_difference/max": 3.818634510040283, "sampling/sampling_logp_difference/mean": 0.005327915074303746, "step": 2160, "step_time": 7.185523380246013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1545.6666666666667, "completions/mean_length": 1481.1145833333333, "completions/mean_terminated_length": 445.82940673828125, "completions/min_length": 121.66666666666667, "completions/min_terminated_length": 121.66666666666667, "entropy": 0.02027509789913893, "epoch": 0.2608173076923077, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.0021144028287380934, "learning_rate": 7.393028846153846e-07, "loss": 0.0099, "num_tokens": 51153667.0, "reward": 0.6624219218889872, "reward_std": 0.2820327083269755, "rewards/reward_fn/mean": 0.6624219218889872, "rewards/reward_fn/std": 0.2820327232281367, "sampling/importance_sampling_ratio/max": 1.3683290878931682, "sampling/importance_sampling_ratio/mean": 0.35977914929389954, "sampling/importance_sampling_ratio/min": 7.048129274759655e-05, "sampling/sampling_logp_difference/max": 2.0489989121754966, "sampling/sampling_logp_difference/mean": 0.004661675387372573, "step": 2170, "step_time": 10.3284527762793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1177.390625, "completions/mean_terminated_length": 623.2289733886719, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.024913838505744933, "epoch": 0.2620192307692308, "frac_reward_zero_std": 0.0, "grad_norm": 0.01226847991347313, "learning_rate": 7.381009615384615e-07, "loss": -0.0062, "num_tokens": 51296732.0, "reward": 0.7580423057079315, "reward_std": 0.25662118941545486, "rewards/reward_fn/mean": 0.7580423057079315, "rewards/reward_fn/std": 0.25662118196487427, "sampling/importance_sampling_ratio/max": 1.242958903312683, "sampling/importance_sampling_ratio/mean": 0.29217003285884857, "sampling/importance_sampling_ratio/min": 0.0012080967990186764, "sampling/sampling_logp_difference/max": 1.5168280601501465, "sampling/sampling_logp_difference/mean": 0.005294673377647996, "step": 2180, "step_time": 6.905082757212222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3333333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1515.0, "completions/mean_length": 1270.8541666666667, "completions/mean_terminated_length": 409.3998209635417, "completions/min_length": 128.66666666666666, "completions/min_terminated_length": 128.66666666666666, "entropy": 0.023746252059936523, "epoch": 0.26322115384615385, "frac_reward_zero_std": 0.0, "grad_norm": 0.01775032840669155, "learning_rate": 7.368990384615384e-07, "loss": 0.0005, "num_tokens": 51538414.0, "reward": 0.710773766040802, "reward_std": 0.2770939568678538, "rewards/reward_fn/mean": 0.710773766040802, "rewards/reward_fn/std": 0.27709393203258514, "sampling/importance_sampling_ratio/max": 1.408383806546529, "sampling/importance_sampling_ratio/mean": 0.3827118178208669, "sampling/importance_sampling_ratio/min": 0.00018417892048698073, "sampling/sampling_logp_difference/max": 1.4272955656051636, "sampling/sampling_logp_difference/mean": 0.0046650430498023825, "step": 2190, "step_time": 10.420082734897733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2424.0, "completions/mean_length": 1616.484375, "completions/mean_terminated_length": 603.1333618164062, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.025040126778185366, "epoch": 0.2644230769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.00433649867773056, "learning_rate": 7.356971153846153e-07, "loss": -0.0046, "num_tokens": 51711981.0, "reward": 0.6862569749355316, "reward_std": 0.2741099148988724, "rewards/reward_fn/mean": 0.6862569749355316, "rewards/reward_fn/std": 0.2741098999977112, "sampling/importance_sampling_ratio/max": 1.7786496877670288, "sampling/importance_sampling_ratio/mean": 0.29103638231754303, "sampling/importance_sampling_ratio/min": 0.00011077895743483168, "sampling/sampling_logp_difference/max": 2.021602213382721, "sampling/sampling_logp_difference/mean": 0.00541699375025928, "step": 2200, "step_time": 7.207343152817339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4583333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2135.0, "completions/mean_length": 1720.09375, "completions/mean_terminated_length": 742.6574300130209, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.02576709557324648, "epoch": 0.265625, "frac_reward_zero_std": 0.0, "grad_norm": 0.0003971197293139994, "learning_rate": 7.344951923076923e-07, "loss": 0.0049, "num_tokens": 51980334.0, "reward": 0.6766280134518942, "reward_std": 0.2525593886772792, "rewards/reward_fn/mean": 0.6766280134518942, "rewards/reward_fn/std": 0.2525593886772792, "sampling/importance_sampling_ratio/max": 0.8176139791806539, "sampling/importance_sampling_ratio/mean": 0.21291539321343103, "sampling/importance_sampling_ratio/min": 1.8110460568247316e-05, "sampling/sampling_logp_difference/max": 3.420974334081014, "sampling/sampling_logp_difference/mean": 0.005080839308599631, "step": 2210, "step_time": 10.52579509196803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1821.0, "completions/mean_length": 1813.1875, "completions/mean_terminated_length": 679.2423400878906, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.0214777410030365, "epoch": 0.2668269230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.005471117794513702, "learning_rate": 7.332932692307692e-07, "loss": 0.0007, "num_tokens": 52177594.0, "reward": 0.590294599533081, "reward_std": 0.3064478039741516, "rewards/reward_fn/mean": 0.590294599533081, "rewards/reward_fn/std": 0.3064478188753128, "sampling/importance_sampling_ratio/max": 1.8388278782367706, "sampling/importance_sampling_ratio/mean": 0.23462273180484772, "sampling/importance_sampling_ratio/min": 7.46080040698871e-05, "sampling/sampling_logp_difference/max": 2.399224042892456, "sampling/sampling_logp_difference/mean": 0.004599733976647258, "step": 2220, "step_time": 7.208472193963826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4895833333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2495.6666666666665, "completions/mean_length": 1832.5208333333333, "completions/mean_terminated_length": 727.5730997721354, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.022440760023891927, "epoch": 0.26802884615384615, "frac_reward_zero_std": 0.0, "grad_norm": 0.0008288036333397031, "learning_rate": 7.320913461538462e-07, "loss": -0.0017, "num_tokens": 52476284.0, "reward": 0.6202168464660645, "reward_std": 0.24915648996829987, "rewards/reward_fn/mean": 0.6202168464660645, "rewards/reward_fn/std": 0.2491564800341924, "sampling/importance_sampling_ratio/max": 1.560708463191986, "sampling/importance_sampling_ratio/mean": 0.25328795115152997, "sampling/importance_sampling_ratio/min": 9.617264898527841e-06, "sampling/sampling_logp_difference/max": 4.068989992141724, "sampling/sampling_logp_difference/mean": 0.004811293600747983, "step": 2230, "step_time": 10.443929594848305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1915.0, "completions/mean_length": 1109.53125, "completions/mean_terminated_length": 612.4750366210938, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.02575064357370138, "epoch": 0.2692307692307692, "frac_reward_zero_std": 0.0, "grad_norm": 0.013835358433425426, "learning_rate": 7.30889423076923e-07, "loss": -0.0042, "num_tokens": 52625086.0, "reward": 0.7844435572624207, "reward_std": 0.20911692827939987, "rewards/reward_fn/mean": 0.7844435572624207, "rewards/reward_fn/std": 0.20911692827939987, "sampling/importance_sampling_ratio/max": 1.7974565625190735, "sampling/importance_sampling_ratio/mean": 0.291825570166111, "sampling/importance_sampling_ratio/min": 0.00013223407341911297, "sampling/sampling_logp_difference/max": 3.116051137447357, "sampling/sampling_logp_difference/mean": 0.005740905646234751, "step": 2240, "step_time": 6.942650651279837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3854166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1407.6666666666667, "completions/mean_length": 1402.625, "completions/mean_terminated_length": 381.15822347005206, "completions/min_length": 123.66666666666667, "completions/min_terminated_length": 123.66666666666667, "entropy": 0.020567667484283448, "epoch": 0.2704326923076923, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.0024749389849603176, "learning_rate": 7.296875000000001e-07, "loss": -0.0043, "num_tokens": 52879370.0, "reward": 0.5598291456699371, "reward_std": 0.3145465850830078, "rewards/reward_fn/mean": 0.5598291456699371, "rewards/reward_fn/std": 0.31454657514890033, "sampling/importance_sampling_ratio/max": 1.5198476513226826, "sampling/importance_sampling_ratio/mean": 0.3789469401041667, "sampling/importance_sampling_ratio/min": 9.974674109495633e-05, "sampling/sampling_logp_difference/max": 2.4675304094950357, "sampling/sampling_logp_difference/mean": 0.004559226023654143, "step": 2250, "step_time": 10.383852160722018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1875.5, "completions/mean_length": 1478.390625, "completions/mean_terminated_length": 573.2449645996094, "completions/min_length": 100.5, "completions/min_terminated_length": 100.5, "entropy": 0.02422373667359352, "epoch": 0.27163461538461536, "frac_reward_zero_std": 0.0, "grad_norm": 0.010370174422860146, "learning_rate": 7.284855769230769e-07, "loss": -0.0, "num_tokens": 53052507.0, "reward": 0.6776822209358215, "reward_std": 0.28089994192123413, "rewards/reward_fn/mean": 0.6776822209358215, "rewards/reward_fn/std": 0.28089992702007294, "sampling/importance_sampling_ratio/max": 0.9471602737903595, "sampling/importance_sampling_ratio/mean": 0.2802973836660385, "sampling/importance_sampling_ratio/min": 4.52261112116048e-06, "sampling/sampling_logp_difference/max": 3.049432933330536, "sampling/sampling_logp_difference/mean": 0.005280302604660392, "step": 2260, "step_time": 7.291759465169162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4270833333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2003.3333333333333, "completions/mean_length": 1777.0104166666667, "completions/mean_terminated_length": 823.8778584798177, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.02159956693649292, "epoch": 0.27283653846153844, "frac_reward_zero_std": 0.0, "grad_norm": 0.00579427694901824, "learning_rate": 7.272836538461538e-07, "loss": 0.0026, "num_tokens": 53325932.0, "reward": 0.6934483846028646, "reward_std": 0.25739896794160205, "rewards/reward_fn/mean": 0.6934483846028646, "rewards/reward_fn/std": 0.25739896297454834, "sampling/importance_sampling_ratio/max": 1.3028364777565002, "sampling/importance_sampling_ratio/mean": 0.24943422277768454, "sampling/importance_sampling_ratio/min": 2.5241407305050718e-05, "sampling/sampling_logp_difference/max": 4.518181721369426, "sampling/sampling_logp_difference/mean": 0.004620525520294905, "step": 2270, "step_time": 10.375668446626515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2240.0, "completions/mean_length": 1007.5, "completions/mean_terminated_length": 456.6939239501953, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.022196087799966336, "epoch": 0.27403846153846156, "frac_reward_zero_std": 0.125, "grad_norm": 0.010713969357311726, "learning_rate": 7.260817307692308e-07, "loss": 0.0034, "num_tokens": 53451724.0, "reward": 0.6500308215618134, "reward_std": 0.35757648944854736, "rewards/reward_fn/mean": 0.6500308215618134, "rewards/reward_fn/std": 0.35757648944854736, "sampling/importance_sampling_ratio/max": 1.1457477807998657, "sampling/importance_sampling_ratio/mean": 0.4403049349784851, "sampling/importance_sampling_ratio/min": 0.00023286174450731778, "sampling/sampling_logp_difference/max": 3.477167785167694, "sampling/sampling_logp_difference/mean": 0.004508649464696646, "step": 2280, "step_time": 6.892928269691765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4270833333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1298.6666666666667, "completions/mean_length": 1513.6979166666667, "completions/mean_terminated_length": 400.65740966796875, "completions/min_length": 134.66666666666666, "completions/min_terminated_length": 134.66666666666666, "entropy": 0.022278699837625027, "epoch": 0.27524038461538464, "frac_reward_zero_std": 0.0, "grad_norm": 0.0048889522440731525, "learning_rate": 7.248798076923076e-07, "loss": 0.0046, "num_tokens": 53709079.0, "reward": 0.6132030089696249, "reward_std": 0.31066320339838666, "rewards/reward_fn/mean": 0.6132030089696249, "rewards/reward_fn/std": 0.31066320339838666, "sampling/importance_sampling_ratio/max": 1.36787744363149, "sampling/importance_sampling_ratio/mean": 0.3917072017987569, "sampling/importance_sampling_ratio/min": 8.03362566633344e-06, "sampling/sampling_logp_difference/max": 4.676920851071675, "sampling/sampling_logp_difference/mean": 0.0044599552638828754, "step": 2290, "step_time": 10.293658644519747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2580.5, "completions/mean_length": 1366.84375, "completions/mean_terminated_length": 679.9464416503906, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.024043825455009937, "epoch": 0.2764423076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.009259712882339954, "learning_rate": 7.236778846153846e-07, "loss": 0.0079, "num_tokens": 53862725.0, "reward": 0.6089758574962616, "reward_std": 0.3548194468021393, "rewards/reward_fn/mean": 0.6089758574962616, "rewards/reward_fn/std": 0.3548194319009781, "sampling/importance_sampling_ratio/max": 1.4784797430038452, "sampling/importance_sampling_ratio/mean": 0.3383607715368271, "sampling/importance_sampling_ratio/min": 3.597013665057602e-05, "sampling/sampling_logp_difference/max": 1.639706313610077, "sampling/sampling_logp_difference/mean": 0.005168903851881623, "step": 2300, "step_time": 7.229374953266233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1656.0, "completions/mean_length": 1325.25, "completions/mean_terminated_length": 560.9087524414062, "completions/min_length": 150.33333333333334, "completions/min_terminated_length": 150.33333333333334, "entropy": 0.026780489832162857, "epoch": 0.2776442307692308, "frac_reward_zero_std": 0.0, "grad_norm": 0.008393900468945503, "learning_rate": 7.224759615384615e-07, "loss": -0.0024, "num_tokens": 54093245.0, "reward": 0.6976022919019064, "reward_std": 0.2685917715231578, "rewards/reward_fn/mean": 0.6976022919019064, "rewards/reward_fn/std": 0.2685917814572652, "sampling/importance_sampling_ratio/max": 1.0693479577700298, "sampling/importance_sampling_ratio/mean": 0.2542320787906647, "sampling/importance_sampling_ratio/min": 0.00017272980767302215, "sampling/sampling_logp_difference/max": 1.5533508857091267, "sampling/sampling_logp_difference/mean": 0.005424777201066415, "step": 2310, "step_time": 10.295535723585635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2427.0, "completions/mean_length": 1673.484375, "completions/mean_terminated_length": 655.0292663574219, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.024499296955764294, "epoch": 0.27884615384615385, "frac_reward_zero_std": 0.0, "grad_norm": 0.011006653308868408, "learning_rate": 7.212740384615384e-07, "loss": -0.0033, "num_tokens": 54279756.0, "reward": 0.6529987454414368, "reward_std": 0.2508949190378189, "rewards/reward_fn/mean": 0.6529987454414368, "rewards/reward_fn/std": 0.2508949115872383, "sampling/importance_sampling_ratio/max": 1.0580652952194214, "sampling/importance_sampling_ratio/mean": 0.25575724244117737, "sampling/importance_sampling_ratio/min": 4.94421510666145e-06, "sampling/sampling_logp_difference/max": 20.827904880046844, "sampling/sampling_logp_difference/mean": 0.00502387061715126, "step": 2320, "step_time": 7.216917460504919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4166666666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2118.3333333333335, "completions/mean_length": 1657.6770833333333, "completions/mean_terminated_length": 754.9524129231771, "completions/min_length": 172.33333333333334, "completions/min_terminated_length": 172.33333333333334, "entropy": 0.019440844282507898, "epoch": 0.2800480769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.008183019235730171, "learning_rate": 7.200721153846153e-07, "loss": 0.0065, "num_tokens": 54551949.0, "reward": 0.6342344085375468, "reward_std": 0.27784345547358197, "rewards/reward_fn/mean": 0.6342344085375468, "rewards/reward_fn/std": 0.2778434455394745, "sampling/importance_sampling_ratio/max": 1.0667417248090107, "sampling/importance_sampling_ratio/mean": 0.3175091991821925, "sampling/importance_sampling_ratio/min": 2.2969104065850843e-05, "sampling/sampling_logp_difference/max": 3.510993162790934, "sampling/sampling_logp_difference/mean": 0.004551070587088664, "step": 2330, "step_time": 10.367672433145344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 3000.0, "completions/max_terminated_length": 2782.0, "completions/mean_length": 1965.609375, "completions/mean_terminated_length": 969.543701171875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.020875727012753486, "epoch": 0.28125, "frac_reward_zero_std": 0.0, "grad_norm": 0.0032283668406307697, "learning_rate": 7.188701923076923e-07, "loss": -0.0022, "num_tokens": 54744188.0, "reward": 0.6077237129211426, "reward_std": 0.26869942992925644, "rewards/reward_fn/mean": 0.6077237129211426, "rewards/reward_fn/std": 0.26869941502809525, "sampling/importance_sampling_ratio/max": 1.8353022933006287, "sampling/importance_sampling_ratio/mean": 0.28598230332136154, "sampling/importance_sampling_ratio/min": 4.9017987066690694e-05, "sampling/sampling_logp_difference/max": 2.834478795528412, "sampling/sampling_logp_difference/mean": 0.004493229323998094, "step": 2340, "step_time": 7.11558348396793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2129.3333333333335, "completions/mean_length": 1361.3854166666667, "completions/mean_terminated_length": 611.7320658365885, "completions/min_length": 172.66666666666666, "completions/min_terminated_length": 172.66666666666666, "entropy": 0.02291214093565941, "epoch": 0.2824519230769231, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.00962124951183796, "learning_rate": 7.176682692307692e-07, "loss": -0.0019, "num_tokens": 54993665.0, "reward": 0.6719236771265665, "reward_std": 0.3106977740923564, "rewards/reward_fn/mean": 0.6719236771265665, "rewards/reward_fn/std": 0.3106977691253026, "sampling/importance_sampling_ratio/max": 1.1240987380345662, "sampling/importance_sampling_ratio/mean": 0.3190251290798187, "sampling/importance_sampling_ratio/min": 0.000337241103731382, "sampling/sampling_logp_difference/max": 3.3814264933268228, "sampling/sampling_logp_difference/mean": 0.004452207746605079, "step": 2350, "step_time": 10.049026822485029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 1398.265625, "completions/mean_terminated_length": 559.2619171142578, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.023130227997899055, "epoch": 0.28365384615384615, "frac_reward_zero_std": 0.0, "grad_norm": 0.004485600162297487, "learning_rate": 7.164663461538461e-07, "loss": 0.0005, "num_tokens": 55146618.0, "reward": 0.6490383148193359, "reward_std": 0.29789918661117554, "rewards/reward_fn/mean": 0.6490383148193359, "rewards/reward_fn/std": 0.29789917171001434, "sampling/importance_sampling_ratio/max": 1.2248769402503967, "sampling/importance_sampling_ratio/mean": 0.25870248675346375, "sampling/importance_sampling_ratio/min": 0.0001677620894042775, "sampling/sampling_logp_difference/max": 3.2039761543273926, "sampling/sampling_logp_difference/mean": 0.0053209285251796246, "step": 2360, "step_time": 7.106125849299133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2030.3333333333333, "completions/mean_length": 1381.3541666666667, "completions/mean_terminated_length": 502.5712890625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.0245378952473402, "epoch": 0.2848557692307692, "frac_reward_zero_std": 0.0, "grad_norm": 0.028105974197387695, "learning_rate": 7.15264423076923e-07, "loss": -0.0004, "num_tokens": 55383180.0, "reward": 0.7043775320053101, "reward_std": 0.24588686227798462, "rewards/reward_fn/mean": 0.7043775320053101, "rewards/reward_fn/std": 0.24588685234387717, "sampling/importance_sampling_ratio/max": 1.5458287000656128, "sampling/importance_sampling_ratio/mean": 0.3153531750043233, "sampling/importance_sampling_ratio/min": 6.667492319441711e-06, "sampling/sampling_logp_difference/max": 2.558857520421346, "sampling/sampling_logp_difference/mean": 0.005209847042957942, "step": 2370, "step_time": 10.252558021619915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2418.5, "completions/mean_length": 1350.1875, "completions/mean_terminated_length": 544.5711059570312, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.02513650320470333, "epoch": 0.2860576923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.0015031315851956606, "learning_rate": 7.140625e-07, "loss": -0.0051, "num_tokens": 55538520.0, "reward": 0.7058539986610413, "reward_std": 0.2674681171774864, "rewards/reward_fn/mean": 0.7058539986610413, "rewards/reward_fn/std": 0.2674681171774864, "sampling/importance_sampling_ratio/max": 1.2192511558532715, "sampling/importance_sampling_ratio/mean": 0.2742614597082138, "sampling/importance_sampling_ratio/min": 5.3045868526169215e-05, "sampling/sampling_logp_difference/max": 2.4388357400894165, "sampling/sampling_logp_difference/mean": 0.005659282673150301, "step": 2380, "step_time": 7.192421539872885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3333333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1869.0, "completions/mean_length": 1379.6666666666667, "completions/mean_terminated_length": 589.3117167154948, "completions/min_length": 94.66666666666667, "completions/min_terminated_length": 94.66666666666667, "entropy": 0.021850576251745225, "epoch": 0.28725961538461536, "frac_reward_zero_std": 0.0, "grad_norm": 0.006475917994976044, "learning_rate": 7.128605769230769e-07, "loss": 0.0033, "num_tokens": 55792976.0, "reward": 0.6938191652297974, "reward_std": 0.29230232040087384, "rewards/reward_fn/mean": 0.6938191652297974, "rewards/reward_fn/std": 0.2923023005326589, "sampling/importance_sampling_ratio/max": 1.160778800646464, "sampling/importance_sampling_ratio/mean": 0.3301597485939662, "sampling/importance_sampling_ratio/min": 5.702097913247902e-05, "sampling/sampling_logp_difference/max": 2.743065277735392, "sampling/sampling_logp_difference/mean": 0.0050287254465123015, "step": 2390, "step_time": 10.513286887668073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1444.5, "completions/mean_length": 1671.828125, "completions/mean_terminated_length": 424.57904052734375, "completions/min_length": 86.5, "completions/min_terminated_length": 86.5, "entropy": 0.020652356371283532, "epoch": 0.28846153846153844, "frac_reward_zero_std": 0.125, "grad_norm": 0.03007565625011921, "learning_rate": 7.116586538461539e-07, "loss": -0.0067, "num_tokens": 55984325.0, "reward": 0.5638885498046875, "reward_std": 0.3233926296234131, "rewards/reward_fn/mean": 0.5638885498046875, "rewards/reward_fn/std": 0.3233925998210907, "sampling/importance_sampling_ratio/max": 1.6092809438705444, "sampling/importance_sampling_ratio/mean": 0.30680519342422485, "sampling/importance_sampling_ratio/min": 5.736954108215286e-06, "sampling/sampling_logp_difference/max": 3.380801200866699, "sampling/sampling_logp_difference/mean": 0.004343154141679406, "step": 2400, "step_time": 7.159383544418961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3541666666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1932.6666666666667, "completions/mean_length": 1407.40625, "completions/mean_terminated_length": 541.966786702474, "completions/min_length": 144.33333333333334, "completions/min_terminated_length": 144.33333333333334, "entropy": 0.02542585674673319, "epoch": 0.28966346153846156, "frac_reward_zero_std": 0.0, "grad_norm": 0.0009999112226068974, "learning_rate": 7.104567307692307e-07, "loss": -0.0066, "num_tokens": 56233148.0, "reward": 0.7040546536445618, "reward_std": 0.25038671493530273, "rewards/reward_fn/mean": 0.7040546536445618, "rewards/reward_fn/std": 0.25038670500119525, "sampling/importance_sampling_ratio/max": 1.52249809106191, "sampling/importance_sampling_ratio/mean": 0.3379756013552348, "sampling/importance_sampling_ratio/min": 9.522159397571764e-06, "sampling/sampling_logp_difference/max": 5.28015152613322, "sampling/sampling_logp_difference/mean": 0.005141134684284528, "step": 2410, "step_time": 10.315823280345649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2520.5, "completions/mean_length": 1229.328125, "completions/mean_terminated_length": 511.2449493408203, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.022818014584481718, "epoch": 0.29086538461538464, "frac_reward_zero_std": 0.125, "grad_norm": 0.006763150449842215, "learning_rate": 7.092548076923077e-07, "loss": -0.0019, "num_tokens": 56377449.0, "reward": 0.7518787384033203, "reward_std": 0.21488826721906662, "rewards/reward_fn/mean": 0.7518787384033203, "rewards/reward_fn/std": 0.21488825976848602, "sampling/importance_sampling_ratio/max": 1.2529589235782623, "sampling/importance_sampling_ratio/mean": 0.4128211736679077, "sampling/importance_sampling_ratio/min": 0.00022620358981839672, "sampling/sampling_logp_difference/max": 1.944908857345581, "sampling/sampling_logp_difference/mean": 0.0050936853513121605, "step": 2420, "step_time": 7.057852476555854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3645833333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2067.6666666666665, "completions/mean_length": 1457.125, "completions/mean_terminated_length": 578.2462972005209, "completions/min_length": 97.33333333333333, "completions/min_terminated_length": 97.33333333333333, "entropy": 0.01900071557611227, "epoch": 0.2920673076923077, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.0028726686723530293, "learning_rate": 7.080528846153846e-07, "loss": 0.0053, "num_tokens": 56614965.0, "reward": 0.552087813615799, "reward_std": 0.3065100610256195, "rewards/reward_fn/mean": 0.552087813615799, "rewards/reward_fn/std": 0.306510051091512, "sampling/importance_sampling_ratio/max": 1.5222391684850056, "sampling/importance_sampling_ratio/mean": 0.3896249483029048, "sampling/importance_sampling_ratio/min": 1.1846683322198483e-05, "sampling/sampling_logp_difference/max": 3.806856155395508, "sampling/sampling_logp_difference/mean": 0.003959737252444029, "step": 2430, "step_time": 10.40785576235503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2266.5, "completions/mean_length": 1336.828125, "completions/mean_terminated_length": 562.8824920654297, "completions/min_length": 111.5, "completions/min_terminated_length": 111.5, "entropy": 0.02079582316800952, "epoch": 0.2932692307692308, "frac_reward_zero_std": 0.0, "grad_norm": 0.0104856351390481, "learning_rate": 7.068509615384614e-07, "loss": 0.0009, "num_tokens": 56774090.0, "reward": 0.6142079830169678, "reward_std": 0.3352607637643814, "rewards/reward_fn/mean": 0.6142079830169678, "rewards/reward_fn/std": 0.3352607488632202, "sampling/importance_sampling_ratio/max": 1.0899843573570251, "sampling/importance_sampling_ratio/mean": 0.2888922542333603, "sampling/importance_sampling_ratio/min": 0.0008091583562901405, "sampling/sampling_logp_difference/max": 1.8966386318206787, "sampling/sampling_logp_difference/mean": 0.00455974112264812, "step": 2440, "step_time": 7.082296597026288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1904.6666666666667, "completions/mean_length": 1159.0729166666667, "completions/mean_terminated_length": 510.53175862630206, "completions/min_length": 160.66666666666666, "completions/min_terminated_length": 160.66666666666666, "entropy": 0.023481736518442632, "epoch": 0.29447115384615385, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.0022590530570596457, "learning_rate": 7.056490384615385e-07, "loss": -0.0051, "num_tokens": 56979137.0, "reward": 0.704776922861735, "reward_std": 0.24083746472994486, "rewards/reward_fn/mean": 0.704776922861735, "rewards/reward_fn/std": 0.24083747466405234, "sampling/importance_sampling_ratio/max": 1.5576691230138142, "sampling/importance_sampling_ratio/mean": 0.38088276982307434, "sampling/importance_sampling_ratio/min": 0.00030510378170826397, "sampling/sampling_logp_difference/max": 1.9104285438855488, "sampling/sampling_logp_difference/mean": 0.005014153818289439, "step": 2450, "step_time": 10.014582949783653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2151.0, "completions/mean_length": 1318.90625, "completions/mean_terminated_length": 571.0470123291016, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.024564348720014097, "epoch": 0.2956730769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.003123306902125478, "learning_rate": 7.044471153846153e-07, "loss": -0.0041, "num_tokens": 57135251.0, "reward": 0.6996957063674927, "reward_std": 0.2601563110947609, "rewards/reward_fn/mean": 0.6996957063674927, "rewards/reward_fn/std": 0.2601563110947609, "sampling/importance_sampling_ratio/max": 1.7859579920768738, "sampling/importance_sampling_ratio/mean": 0.3322088122367859, "sampling/importance_sampling_ratio/min": 0.00023511469225923065, "sampling/sampling_logp_difference/max": 2.150243639945984, "sampling/sampling_logp_difference/mean": 0.005182928871363401, "step": 2460, "step_time": 7.049402061197907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3020833333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2208.6666666666665, "completions/mean_length": 1330.1666666666667, "completions/mean_terminated_length": 641.1191609700521, "completions/min_length": 147.33333333333334, "completions/min_terminated_length": 147.33333333333334, "entropy": 0.025427759811282157, "epoch": 0.296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.011993623338639736, "learning_rate": 7.032451923076923e-07, "loss": -0.001, "num_tokens": 57366947.0, "reward": 0.7011173963546753, "reward_std": 0.2964772582054138, "rewards/reward_fn/mean": 0.7011173963546753, "rewards/reward_fn/std": 0.2964772383371989, "sampling/importance_sampling_ratio/max": 1.6654469966888428, "sampling/importance_sampling_ratio/mean": 0.3277270197868347, "sampling/importance_sampling_ratio/min": 0.00012231622895342298, "sampling/sampling_logp_difference/max": 3.1074304580688477, "sampling/sampling_logp_difference/mean": 0.00524534285068512, "step": 2470, "step_time": 10.248499845713377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2750.5, "completions/mean_length": 1890.015625, "completions/mean_terminated_length": 858.0056457519531, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "entropy": 0.026715070381760596, "epoch": 0.2980769230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.0007672143983654678, "learning_rate": 7.020432692307692e-07, "loss": -0.0093, "num_tokens": 57546340.0, "reward": 0.6472473442554474, "reward_std": 0.26266518235206604, "rewards/reward_fn/mean": 0.6472473442554474, "rewards/reward_fn/std": 0.26266516745090485, "sampling/importance_sampling_ratio/max": 2.114471197128296, "sampling/importance_sampling_ratio/mean": 0.20331062376499176, "sampling/importance_sampling_ratio/min": 1.9806984028036823e-05, "sampling/sampling_logp_difference/max": 2.167837381362915, "sampling/sampling_logp_difference/mean": 0.005830670706927776, "step": 2480, "step_time": 7.167404567077756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.17708333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1857.6666666666667, "completions/mean_length": 967.0104166666666, "completions/mean_terminated_length": 521.7696634928385, "completions/min_length": 132.66666666666666, "completions/min_terminated_length": 132.66666666666666, "entropy": 0.02020426411181688, "epoch": 0.29927884615384615, "frac_reward_zero_std": 0.0, "grad_norm": 0.007853524759411812, "learning_rate": 7.008413461538462e-07, "loss": 0.0029, "num_tokens": 57733957.0, "reward": 0.7057060996691386, "reward_std": 0.31171684463818866, "rewards/reward_fn/mean": 0.7057060996691386, "rewards/reward_fn/std": 0.31171682476997375, "sampling/importance_sampling_ratio/max": 1.6292778650919597, "sampling/importance_sampling_ratio/mean": 0.44232377409935, "sampling/importance_sampling_ratio/min": 0.00045912942247620475, "sampling/sampling_logp_difference/max": 2.183618982632955, "sampling/sampling_logp_difference/mean": 0.0045942106905082864, "step": 2490, "step_time": 10.216272149048745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2632.5, "completions/mean_length": 1283.734375, "completions/mean_terminated_length": 778.4583435058594, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.024301193468272687, "epoch": 0.3004807692307692, "frac_reward_zero_std": 0.0, "grad_norm": 0.019290300086140633, "learning_rate": 6.99639423076923e-07, "loss": -0.0043, "num_tokens": 57888052.0, "reward": 0.7471644878387451, "reward_std": 0.260383240878582, "rewards/reward_fn/mean": 0.7471644878387451, "rewards/reward_fn/std": 0.2603832259774208, "sampling/importance_sampling_ratio/max": 1.267920970916748, "sampling/importance_sampling_ratio/mean": 0.2693132311105728, "sampling/importance_sampling_ratio/min": 7.0168871388887055e-06, "sampling/sampling_logp_difference/max": 2.2354648113250732, "sampling/sampling_logp_difference/mean": 0.005360081326216459, "step": 2500, "step_time": 6.930619219318032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20833333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1675.0, "completions/mean_length": 1027.8333333333333, "completions/mean_terminated_length": 525.7637125651041, "completions/min_length": 103.33333333333333, "completions/min_terminated_length": 103.33333333333333, "entropy": 0.020756184495985507, "epoch": 0.3016826923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.007989599369466305, "learning_rate": 6.984375e-07, "loss": 0.0248, "num_tokens": 58068204.0, "reward": 0.7807705799738566, "reward_std": 0.21655613680680594, "rewards/reward_fn/mean": 0.7807705799738566, "rewards/reward_fn/std": 0.21655613680680594, "sampling/importance_sampling_ratio/max": 1.5491312344868977, "sampling/importance_sampling_ratio/mean": 0.4364718695481618, "sampling/importance_sampling_ratio/min": 0.00032232675554647966, "sampling/sampling_logp_difference/max": 2.753358523050944, "sampling/sampling_logp_difference/mean": 0.004918592981994152, "step": 2510, "step_time": 10.211592224333435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1821.5, "completions/mean_length": 1227.671875, "completions/mean_terminated_length": 479.6383514404297, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.02064874432981014, "epoch": 0.30288461538461536, "frac_reward_zero_std": 0.125, "grad_norm": 0.013839450664818287, "learning_rate": 6.972355769230769e-07, "loss": -0.0003, "num_tokens": 58202583.0, "reward": 0.7415445446968079, "reward_std": 0.23331579566001892, "rewards/reward_fn/mean": 0.7415445446968079, "rewards/reward_fn/std": 0.23331577330827713, "sampling/importance_sampling_ratio/max": 1.4557114839553833, "sampling/importance_sampling_ratio/mean": 0.40845997631549835, "sampling/importance_sampling_ratio/min": 0.0006763396741007455, "sampling/sampling_logp_difference/max": 1.4904084503650665, "sampling/sampling_logp_difference/mean": 0.004573354730382562, "step": 2520, "step_time": 7.0738221146166325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 2846.0, "completions/max_terminated_length": 2588.0, "completions/mean_length": 963.1979166666666, "completions/mean_terminated_length": 573.084706624349, "completions/min_length": 127.66666666666667, "completions/min_terminated_length": 127.66666666666667, "entropy": 0.021901669539511205, "epoch": 0.30408653846153844, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.011012652888894081, "learning_rate": 6.960336538461538e-07, "loss": 0.0014, "num_tokens": 58399378.0, "reward": 0.7352368235588074, "reward_std": 0.284562423825264, "rewards/reward_fn/mean": 0.7352368235588074, "rewards/reward_fn/std": 0.2845624138911565, "sampling/importance_sampling_ratio/max": 1.229049066702525, "sampling/importance_sampling_ratio/mean": 0.4347311506668727, "sampling/importance_sampling_ratio/min": 0.00031060796512368444, "sampling/sampling_logp_difference/max": 1.7081786394119263, "sampling/sampling_logp_difference/mean": 0.00450775360999008, "step": 2530, "step_time": 9.405714964773505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2474.0, "completions/mean_length": 1633.390625, "completions/mean_terminated_length": 744.7981567382812, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.02234210344031453, "epoch": 0.30528846153846156, "frac_reward_zero_std": 0.0, "grad_norm": 0.003966109827160835, "learning_rate": 6.948317307692308e-07, "loss": -0.0015, "num_tokens": 58572011.0, "reward": 0.6652782559394836, "reward_std": 0.2914584130048752, "rewards/reward_fn/mean": 0.6652782559394836, "rewards/reward_fn/std": 0.2914584130048752, "sampling/importance_sampling_ratio/max": 0.8540104031562805, "sampling/importance_sampling_ratio/mean": 0.1833297461271286, "sampling/importance_sampling_ratio/min": 1.3728541861213195e-05, "sampling/sampling_logp_difference/max": 5.447339713573456, "sampling/sampling_logp_difference/mean": 0.005124102812260389, "step": 2540, "step_time": 7.073184914793819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1391.6666666666667, "completions/mean_length": 1528.8541666666667, "completions/mean_terminated_length": 416.1230214436849, "completions/min_length": 141.33333333333334, "completions/min_terminated_length": 141.33333333333334, "entropy": 0.021175825595855714, "epoch": 0.30649038461538464, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.015251124277710915, "learning_rate": 6.936298076923076e-07, "loss": -0.0044, "num_tokens": 58820373.0, "reward": 0.6382591525713602, "reward_std": 0.2961338261763255, "rewards/reward_fn/mean": 0.6382591525713602, "rewards/reward_fn/std": 0.2961338361104329, "sampling/importance_sampling_ratio/max": 1.7008673350016277, "sampling/importance_sampling_ratio/mean": 0.40536653498808545, "sampling/importance_sampling_ratio/min": 2.5232890038751066e-05, "sampling/sampling_logp_difference/max": 2.4001962741216025, "sampling/sampling_logp_difference/mean": 0.004421724549805124, "step": 2550, "step_time": 10.503717879857868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1912.5, "completions/mean_length": 1018.8125, "completions/mean_terminated_length": 411.3716735839844, "completions/min_length": 119.5, "completions/min_terminated_length": 119.5, "entropy": 0.022008635476231574, "epoch": 0.3076923076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.018862491473555565, "learning_rate": 6.924278846153846e-07, "loss": 0.0008, "num_tokens": 58949393.0, "reward": 0.7478460967540741, "reward_std": 0.24976656585931778, "rewards/reward_fn/mean": 0.7478460967540741, "rewards/reward_fn/std": 0.24976658076047897, "sampling/importance_sampling_ratio/max": 1.434495985507965, "sampling/importance_sampling_ratio/mean": 0.43471428751945496, "sampling/importance_sampling_ratio/min": 7.701870708842762e-05, "sampling/sampling_logp_difference/max": 3.658822774887085, "sampling/sampling_logp_difference/mean": 0.005011367844417691, "step": 2560, "step_time": 7.022438845969736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4583333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2316.0, "completions/mean_length": 1666.0104166666667, "completions/mean_terminated_length": 567.1122029622396, "completions/min_length": 171.33333333333334, "completions/min_terminated_length": 171.33333333333334, "entropy": 0.022210644371807576, "epoch": 0.3088942307692308, "frac_reward_zero_std": 0.0, "grad_norm": 0.0030874686781316996, "learning_rate": 6.912259615384615e-07, "loss": 0.0055, "num_tokens": 59233874.0, "reward": 0.6024896701176962, "reward_std": 0.3036172688007355, "rewards/reward_fn/mean": 0.6024896701176962, "rewards/reward_fn/std": 0.303617258866628, "sampling/importance_sampling_ratio/max": 1.804792006810506, "sampling/importance_sampling_ratio/mean": 0.278071753680706, "sampling/importance_sampling_ratio/min": 3.023966140366004e-05, "sampling/sampling_logp_difference/max": 2.790013154347738, "sampling/sampling_logp_difference/mean": 0.004430186624328296, "step": 2570, "step_time": 10.46717821341008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2046.5, "completions/mean_length": 1137.59375, "completions/mean_terminated_length": 564.2383422851562, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.021702189929783344, "epoch": 0.31009615384615385, "frac_reward_zero_std": 0.125, "grad_norm": 0.01513916626572609, "learning_rate": 6.900240384615384e-07, "loss": -0.0021, "num_tokens": 59377328.0, "reward": 0.6795879006385803, "reward_std": 0.2873607873916626, "rewards/reward_fn/mean": 0.6795879006385803, "rewards/reward_fn/std": 0.287360779941082, "sampling/importance_sampling_ratio/max": 1.1054389476776123, "sampling/importance_sampling_ratio/mean": 0.3910888582468033, "sampling/importance_sampling_ratio/min": 9.144051637122175e-05, "sampling/sampling_logp_difference/max": 2.3236443996429443, "sampling/sampling_logp_difference/mean": 0.00459505314938724, "step": 2580, "step_time": 7.0132126190699635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2604166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1837.6666666666667, "completions/mean_length": 1167.8125, "completions/mean_terminated_length": 520.3852945963541, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.0201058279722929, "epoch": 0.3112980769230769, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.009968595579266548, "learning_rate": 6.888221153846154e-07, "loss": -0.0025, "num_tokens": 59598478.0, "reward": 0.6385871569315592, "reward_std": 0.32590152819951373, "rewards/reward_fn/mean": 0.6385871569315592, "rewards/reward_fn/std": 0.32590152819951373, "sampling/importance_sampling_ratio/max": 1.9505271911621094, "sampling/importance_sampling_ratio/mean": 0.46268848578135174, "sampling/importance_sampling_ratio/min": 0.00019972455606875883, "sampling/sampling_logp_difference/max": 2.157841682434082, "sampling/sampling_logp_difference/mean": 0.00473470613360405, "step": 2590, "step_time": 10.544604310486466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1956.5, "completions/mean_length": 1421.3125, "completions/mean_terminated_length": 474.1000061035156, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.020858528278768062, "epoch": 0.3125, "frac_reward_zero_std": 0.125, "grad_norm": 0.008234012871980667, "learning_rate": 6.876201923076923e-07, "loss": 0.0042, "num_tokens": 59748330.0, "reward": 0.5669319927692413, "reward_std": 0.33046291768550873, "rewards/reward_fn/mean": 0.5669319927692413, "rewards/reward_fn/std": 0.33046291768550873, "sampling/importance_sampling_ratio/max": 1.0850653946399689, "sampling/importance_sampling_ratio/mean": 0.4071808159351349, "sampling/importance_sampling_ratio/min": 0.0002372109993302729, "sampling/sampling_logp_difference/max": 1.855405569076538, "sampling/sampling_logp_difference/mean": 0.0039479241240769625, "step": 2600, "step_time": 7.0136195855215195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2370.6666666666665, "completions/mean_length": 1547.125, "completions/mean_terminated_length": 699.3502197265625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.02146148830652237, "epoch": 0.3137019230769231, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.009639219380915165, "learning_rate": 6.864182692307691e-07, "loss": -0.0041, "num_tokens": 60012286.0, "reward": 0.7109952569007874, "reward_std": 0.23561683793862662, "rewards/reward_fn/mean": 0.7109952569007874, "rewards/reward_fn/std": 0.23561683297157288, "sampling/importance_sampling_ratio/max": 1.7070880929629009, "sampling/importance_sampling_ratio/mean": 0.3366934011379878, "sampling/importance_sampling_ratio/min": 5.924124964925189e-05, "sampling/sampling_logp_difference/max": 2.378955284754435, "sampling/sampling_logp_difference/mean": 0.0049744971717397375, "step": 2610, "step_time": 10.2751808013767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2287.0, "completions/mean_length": 1659.96875, "completions/mean_terminated_length": 742.6055908203125, "completions/min_length": 154.5, "completions/min_terminated_length": 154.5, "entropy": 0.0200947736389935, "epoch": 0.31490384615384615, "frac_reward_zero_std": 0.0, "grad_norm": 0.002984852297231555, "learning_rate": 6.852163461538462e-07, "loss": 0.0116, "num_tokens": 60188620.0, "reward": 0.6690588891506195, "reward_std": 0.2689304053783417, "rewards/reward_fn/mean": 0.6690588891506195, "rewards/reward_fn/std": 0.2689303904771805, "sampling/importance_sampling_ratio/max": 1.803581714630127, "sampling/importance_sampling_ratio/mean": 0.2974337339401245, "sampling/importance_sampling_ratio/min": 1.0920879276454798e-06, "sampling/sampling_logp_difference/max": 1.9769444465637207, "sampling/sampling_logp_difference/mean": 0.004877972416579723, "step": 2620, "step_time": 7.3687951965257525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3541666666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 1509.3229166666667, "completions/mean_terminated_length": 676.9148254394531, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.02398980427533388, "epoch": 0.3161057692307692, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.004511606879532337, "learning_rate": 6.84014423076923e-07, "loss": -0.0062, "num_tokens": 60448179.0, "reward": 0.6732983787854513, "reward_std": 0.27331798275311786, "rewards/reward_fn/mean": 0.6732983787854513, "rewards/reward_fn/std": 0.27331798275311786, "sampling/importance_sampling_ratio/max": 1.663867672284444, "sampling/importance_sampling_ratio/mean": 0.3100566814343135, "sampling/importance_sampling_ratio/min": 1.529505828064733e-05, "sampling/sampling_logp_difference/max": 3.456721385320028, "sampling/sampling_logp_difference/mean": 0.005149587833633025, "step": 2630, "step_time": 10.545946866739541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1671.0, "completions/mean_length": 1449.265625, "completions/mean_terminated_length": 399.1903533935547, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.021947317011654376, "epoch": 0.3173076923076923, "frac_reward_zero_std": 0.25, "grad_norm": 0.003182592336088419, "learning_rate": 6.828125e-07, "loss": 0.0155, "num_tokens": 60602012.0, "reward": 0.6929268538951874, "reward_std": 0.2718895673751831, "rewards/reward_fn/mean": 0.6929268538951874, "rewards/reward_fn/std": 0.2718895524740219, "sampling/importance_sampling_ratio/max": 1.3968918323516846, "sampling/importance_sampling_ratio/mean": 0.38471804559230804, "sampling/importance_sampling_ratio/min": 1.3193950508139096e-05, "sampling/sampling_logp_difference/max": 3.9099990129470825, "sampling/sampling_logp_difference/mean": 0.00429053558036685, "step": 2640, "step_time": 7.081026136688888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3541666666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2052.0, "completions/mean_length": 1451.59375, "completions/mean_terminated_length": 567.4925435384115, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.01953637655824423, "epoch": 0.31850961538461536, "frac_reward_zero_std": 0.0, "grad_norm": 0.009035379625856876, "learning_rate": 6.816105769230769e-07, "loss": -0.0043, "num_tokens": 60843181.0, "reward": 0.6724948485692342, "reward_std": 0.2905808488527934, "rewards/reward_fn/mean": 0.6724948485692342, "rewards/reward_fn/std": 0.2905808389186859, "sampling/importance_sampling_ratio/max": 1.4895813465118408, "sampling/importance_sampling_ratio/mean": 0.3555075029532115, "sampling/importance_sampling_ratio/min": 5.601077305072977e-05, "sampling/sampling_logp_difference/max": 3.0457966725031533, "sampling/sampling_logp_difference/mean": 0.004175575915724039, "step": 2650, "step_time": 10.441791606415062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2402.5, "completions/mean_length": 1401.21875, "completions/mean_terminated_length": 521.3574981689453, "completions/min_length": 126.5, "completions/min_terminated_length": 126.5, "entropy": 0.02022392824292183, "epoch": 0.31971153846153844, "frac_reward_zero_std": 0.125, "grad_norm": 0.019442804157733917, "learning_rate": 6.804086538461539e-07, "loss": -0.004, "num_tokens": 60999211.0, "reward": 0.652643233537674, "reward_std": 0.27523622661828995, "rewards/reward_fn/mean": 0.652643233537674, "rewards/reward_fn/std": 0.27523621916770935, "sampling/importance_sampling_ratio/max": 1.4661728739738464, "sampling/importance_sampling_ratio/mean": 0.4058280438184738, "sampling/importance_sampling_ratio/min": 0.00037329854512790916, "sampling/sampling_logp_difference/max": 2.2816264629364014, "sampling/sampling_logp_difference/mean": 0.004084443673491478, "step": 2660, "step_time": 7.023004062473774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20833333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2484.3333333333335, "completions/mean_length": 955.6666666666666, "completions/mean_terminated_length": 418.9147033691406, "completions/min_length": 127.33333333333333, "completions/min_terminated_length": 127.33333333333333, "entropy": 0.02477583158761263, "epoch": 0.32091346153846156, "frac_reward_zero_std": 0.0, "grad_norm": 0.009081405587494373, "learning_rate": 6.792067307692307e-07, "loss": -0.0055, "num_tokens": 61197315.0, "reward": 0.7498939236005148, "reward_std": 0.25310619672139484, "rewards/reward_fn/mean": 0.7498939236005148, "rewards/reward_fn/std": 0.2531062066555023, "sampling/importance_sampling_ratio/max": 1.794108271598816, "sampling/importance_sampling_ratio/mean": 0.47360177834828693, "sampling/importance_sampling_ratio/min": 0.0003416417549715334, "sampling/sampling_logp_difference/max": 2.3949012756347656, "sampling/sampling_logp_difference/mean": 0.005358665715903044, "step": 2670, "step_time": 10.312116410583258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2040.5, "completions/mean_length": 1392.203125, "completions/mean_terminated_length": 666.4019470214844, "completions/min_length": 128.5, "completions/min_terminated_length": 128.5, "entropy": 0.022008752822875975, "epoch": 0.32211538461538464, "frac_reward_zero_std": 0.125, "grad_norm": 0.007232086267322302, "learning_rate": 6.780048076923076e-07, "loss": 0.0014, "num_tokens": 61366032.0, "reward": 0.5997923910617828, "reward_std": 0.3712899386882782, "rewards/reward_fn/mean": 0.5997923910617828, "rewards/reward_fn/std": 0.3712899386882782, "sampling/importance_sampling_ratio/max": 1.2389085292816162, "sampling/importance_sampling_ratio/mean": 0.38568253815174103, "sampling/importance_sampling_ratio/min": 1.4343946475037228e-06, "sampling/sampling_logp_difference/max": 4.607123374938965, "sampling/sampling_logp_difference/mean": 0.004591244738548994, "step": 2680, "step_time": 7.083099307026714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2412.6666666666665, "completions/mean_length": 894.0833333333334, "completions/mean_terminated_length": 476.43201700846356, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.0229037381708622, "epoch": 0.3233173076923077, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.014745969325304031, "learning_rate": 6.768028846153846e-07, "loss": 0.0085, "num_tokens": 61559144.0, "reward": 0.8244989911715189, "reward_std": 0.17986035098632178, "rewards/reward_fn/mean": 0.8244989911715189, "rewards/reward_fn/std": 0.1798603410522143, "sampling/importance_sampling_ratio/max": 1.5572772820790608, "sampling/importance_sampling_ratio/mean": 0.5031811396280924, "sampling/importance_sampling_ratio/min": 1.4945135565843278e-05, "sampling/sampling_logp_difference/max": 2.3666778405507407, "sampling/sampling_logp_difference/mean": 0.005499154484520356, "step": 2690, "step_time": 9.827028383500874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2403.0, "completions/mean_length": 985.390625, "completions/mean_terminated_length": 476.77020263671875, "completions/min_length": 98.5, "completions/min_terminated_length": 98.5, "entropy": 0.026623833179473876, "epoch": 0.3245192307692308, "frac_reward_zero_std": 0.0, "grad_norm": 0.010271778330206871, "learning_rate": 6.756009615384614e-07, "loss": -0.0096, "num_tokens": 61700217.0, "reward": 0.748784214258194, "reward_std": 0.26408930122852325, "rewards/reward_fn/mean": 0.748784214258194, "rewards/reward_fn/std": 0.26408928632736206, "sampling/importance_sampling_ratio/max": 1.295841097831726, "sampling/importance_sampling_ratio/mean": 0.42000433802604675, "sampling/importance_sampling_ratio/min": 1.5540973436145578e-05, "sampling/sampling_logp_difference/max": 1.5349153280258179, "sampling/sampling_logp_difference/mean": 0.0059154212940484285, "step": 2700, "step_time": 6.940692375227809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2201.6666666666665, "completions/mean_length": 1392.90625, "completions/mean_terminated_length": 562.2186482747396, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.022494496405124666, "epoch": 0.32572115384615385, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.0073225838132202625, "learning_rate": 6.743990384615385e-07, "loss": -0.0036, "num_tokens": 61944192.0, "reward": 0.681248664855957, "reward_std": 0.2602781156698863, "rewards/reward_fn/mean": 0.681248664855957, "rewards/reward_fn/std": 0.2602781156698863, "sampling/importance_sampling_ratio/max": 1.0663028359413147, "sampling/importance_sampling_ratio/mean": 0.3102761010328929, "sampling/importance_sampling_ratio/min": 2.268404462787051e-05, "sampling/sampling_logp_difference/max": 2.5366442600886026, "sampling/sampling_logp_difference/mean": 0.0046813590452075005, "step": 2710, "step_time": 10.537433422729373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1437.546875, "completions/mean_terminated_length": 609.125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.022177499532699586, "epoch": 0.3269230769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.00442957878112793, "learning_rate": 6.731971153846153e-07, "loss": 0.0018, "num_tokens": 62116915.0, "reward": 0.6175533533096313, "reward_std": 0.33993931114673615, "rewards/reward_fn/mean": 0.6175533533096313, "rewards/reward_fn/std": 0.33993931114673615, "sampling/importance_sampling_ratio/max": 1.087655484676361, "sampling/importance_sampling_ratio/mean": 0.30229805409908295, "sampling/importance_sampling_ratio/min": 0.0002461662795276709, "sampling/sampling_logp_difference/max": 4.210425972938538, "sampling/sampling_logp_difference/mean": 0.005010226042941213, "step": 2720, "step_time": 7.250686732586473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2916666666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2704.0, "completions/mean_length": 1293.75, "completions/mean_terminated_length": 621.3069356282552, "completions/min_length": 113.66666666666667, "completions/min_terminated_length": 113.66666666666667, "entropy": 0.02146989433094859, "epoch": 0.328125, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.009867419488728046, "learning_rate": 6.719951923076924e-07, "loss": -0.0033, "num_tokens": 62356475.0, "reward": 0.59722367922465, "reward_std": 0.3088560203711192, "rewards/reward_fn/mean": 0.59722367922465, "rewards/reward_fn/std": 0.3088560104370117, "sampling/importance_sampling_ratio/max": 1.1123722195625305, "sampling/importance_sampling_ratio/mean": 0.3866537759701411, "sampling/importance_sampling_ratio/min": 3.0023977160453796e-05, "sampling/sampling_logp_difference/max": 1.6859232584635417, "sampling/sampling_logp_difference/mean": 0.004541453284521897, "step": 2730, "step_time": 10.342506656330078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2432.5, "completions/mean_length": 1606.703125, "completions/mean_terminated_length": 724.702392578125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.024960623122751713, "epoch": 0.3293269230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.005814395844936371, "learning_rate": 6.707932692307692e-07, "loss": -0.0052, "num_tokens": 62535248.0, "reward": 0.67376908659935, "reward_std": 0.2841772064566612, "rewards/reward_fn/mean": 0.67376908659935, "rewards/reward_fn/std": 0.28417718410491943, "sampling/importance_sampling_ratio/max": 1.1763513088226318, "sampling/importance_sampling_ratio/mean": 0.19779972732067108, "sampling/importance_sampling_ratio/min": 0.00017831828017733642, "sampling/sampling_logp_difference/max": 2.2545769214630127, "sampling/sampling_logp_difference/mean": 0.00495361490175128, "step": 2740, "step_time": 7.218523137550801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3541666666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2544.6666666666665, "completions/mean_length": 1426.5, "completions/mean_terminated_length": 573.5045166015625, "completions/min_length": 132.66666666666666, "completions/min_terminated_length": 132.66666666666666, "entropy": 0.02365986183285713, "epoch": 0.33052884615384615, "frac_reward_zero_std": 0.0, "grad_norm": 0.001892165862955153, "learning_rate": 6.695913461538461e-07, "loss": -0.004, "num_tokens": 62792032.0, "reward": 0.6948449412981669, "reward_std": 0.25716450810432434, "rewards/reward_fn/mean": 0.6948449412981669, "rewards/reward_fn/std": 0.25716450313727063, "sampling/importance_sampling_ratio/max": 2.091051538785299, "sampling/importance_sampling_ratio/mean": 0.36885348955790204, "sampling/importance_sampling_ratio/min": 2.819098851129335e-05, "sampling/sampling_logp_difference/max": 3.6619437535603843, "sampling/sampling_logp_difference/mean": 0.005043247093756993, "step": 2750, "step_time": 10.243680673558265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 1365.0, "completions/mean_length": 1059.609375, "completions/mean_terminated_length": 412.79945373535156, "completions/min_length": 95.5, "completions/min_terminated_length": 95.5, "entropy": 0.024268843792378904, "epoch": 0.3317307692307692, "frac_reward_zero_std": 0.0, "grad_norm": 0.003944206051528454, "learning_rate": 6.683894230769231e-07, "loss": -0.0032, "num_tokens": 62926607.0, "reward": 0.7260787785053253, "reward_std": 0.2624466270208359, "rewards/reward_fn/mean": 0.7260787785053253, "rewards/reward_fn/std": 0.2624466270208359, "sampling/importance_sampling_ratio/max": 1.5636195540428162, "sampling/importance_sampling_ratio/mean": 0.42385871708393097, "sampling/importance_sampling_ratio/min": 2.731596134708525e-05, "sampling/sampling_logp_difference/max": 1.9311045408248901, "sampling/sampling_logp_difference/mean": 0.0055333818309009075, "step": 2760, "step_time": 6.951835496816784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2141.6666666666665, "completions/mean_length": 1443.4375, "completions/mean_terminated_length": 563.2229512532552, "completions/min_length": 141.66666666666666, "completions/min_terminated_length": 141.66666666666666, "entropy": 0.021136076003313065, "epoch": 0.3329326923076923, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.011586152948439121, "learning_rate": 6.671875e-07, "loss": -0.0041, "num_tokens": 63170433.0, "reward": 0.648607055346171, "reward_std": 0.2777187029520671, "rewards/reward_fn/mean": 0.648607055346171, "rewards/reward_fn/std": 0.2777187128861745, "sampling/importance_sampling_ratio/max": 1.711681107680003, "sampling/importance_sampling_ratio/mean": 0.3722013632456462, "sampling/importance_sampling_ratio/min": 0.0002835173918356304, "sampling/sampling_logp_difference/max": 3.241584221522013, "sampling/sampling_logp_difference/mean": 0.004254490602761507, "step": 2770, "step_time": 10.362419904675335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1716.0, "completions/mean_length": 1166.203125, "completions/mean_terminated_length": 505.15309143066406, "completions/min_length": 135.5, "completions/min_terminated_length": 135.5, "entropy": 0.022739920392632486, "epoch": 0.33413461538461536, "frac_reward_zero_std": 0.0, "grad_norm": 0.004183382727205753, "learning_rate": 6.659855769230769e-07, "loss": 0.0174, "num_tokens": 63308766.0, "reward": 0.7427188158035278, "reward_std": 0.27576878666877747, "rewards/reward_fn/mean": 0.7427188158035278, "rewards/reward_fn/std": 0.27576878666877747, "sampling/importance_sampling_ratio/max": 1.3191691637039185, "sampling/importance_sampling_ratio/mean": 0.3269302695989609, "sampling/importance_sampling_ratio/min": 9.264476693715551e-05, "sampling/sampling_logp_difference/max": 1.8308717608451843, "sampling/sampling_logp_difference/mean": 0.00473081530071795, "step": 2780, "step_time": 6.960712177213281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3229166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2217.0, "completions/mean_length": 1319.6145833333333, "completions/mean_terminated_length": 611.6361185709635, "completions/min_length": 133.33333333333334, "completions/min_terminated_length": 133.33333333333334, "entropy": 0.019841845892369748, "epoch": 0.33533653846153844, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.006965190172195435, "learning_rate": 6.647836538461539e-07, "loss": -0.0029, "num_tokens": 63551889.0, "reward": 0.6118125915527344, "reward_std": 0.32725514968236286, "rewards/reward_fn/mean": 0.6118125915527344, "rewards/reward_fn/std": 0.32725514968236286, "sampling/importance_sampling_ratio/max": 1.4436321258544922, "sampling/importance_sampling_ratio/mean": 0.40511467307806015, "sampling/importance_sampling_ratio/min": 0.005175003861343915, "sampling/sampling_logp_difference/max": 1.682345191637675, "sampling/sampling_logp_difference/mean": 0.004487401184936364, "step": 2790, "step_time": 10.130945824272931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2320.5, "completions/mean_length": 1422.65625, "completions/mean_terminated_length": 657.2370300292969, "completions/min_length": 108.5, "completions/min_terminated_length": 108.5, "entropy": 0.02257414236664772, "epoch": 0.33653846153846156, "frac_reward_zero_std": 0.0, "grad_norm": 0.01958511769771576, "learning_rate": 6.635817307692307e-07, "loss": 0.0032, "num_tokens": 63720307.0, "reward": 0.7278270721435547, "reward_std": 0.2573973834514618, "rewards/reward_fn/mean": 0.7278270721435547, "rewards/reward_fn/std": 0.2573973834514618, "sampling/importance_sampling_ratio/max": 1.318169116973877, "sampling/importance_sampling_ratio/mean": 0.3256859630346298, "sampling/importance_sampling_ratio/min": 8.616350044121646e-05, "sampling/sampling_logp_difference/max": 2.960168957710266, "sampling/sampling_logp_difference/mean": 0.004553714068606496, "step": 2800, "step_time": 7.264800117071718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2708333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1272.6354166666667, "completions/mean_terminated_length": 644.2155965169271, "completions/min_length": 127.33333333333333, "completions/min_terminated_length": 127.33333333333333, "entropy": 0.024489447847008704, "epoch": 0.33774038461538464, "frac_reward_zero_std": 0.0, "grad_norm": 0.004968682304024696, "learning_rate": 6.623798076923076e-07, "loss": -0.0001, "num_tokens": 63938640.0, "reward": 0.6827949086825053, "reward_std": 0.28945643703142804, "rewards/reward_fn/mean": 0.6827949086825053, "rewards/reward_fn/std": 0.28945643703142804, "sampling/importance_sampling_ratio/max": 1.3932164907455444, "sampling/importance_sampling_ratio/mean": 0.27409423887729645, "sampling/importance_sampling_ratio/min": 0.0003727265617878099, "sampling/sampling_logp_difference/max": 1.8496002753575642, "sampling/sampling_logp_difference/mean": 0.005376479743669431, "step": 2810, "step_time": 10.20981946270913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1795.0, "completions/mean_length": 1587.484375, "completions/mean_terminated_length": 706.2364196777344, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.022299112379550935, "epoch": 0.3389423076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.009893151000142097, "learning_rate": 6.611778846153846e-07, "loss": 0.0121, "num_tokens": 64118015.0, "reward": 0.6286440789699554, "reward_std": 0.3504648357629776, "rewards/reward_fn/mean": 0.6286440789699554, "rewards/reward_fn/std": 0.3504648357629776, "sampling/importance_sampling_ratio/max": 1.778372585773468, "sampling/importance_sampling_ratio/mean": 0.2587231323122978, "sampling/importance_sampling_ratio/min": 7.283518993972393e-05, "sampling/sampling_logp_difference/max": 4.551160931587219, "sampling/sampling_logp_difference/mean": 0.004463450983166695, "step": 2820, "step_time": 7.18159779375419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3333333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2549.3333333333335, "completions/mean_length": 1369.15625, "completions/mean_terminated_length": 582.4644165039062, "completions/min_length": 124.33333333333333, "completions/min_terminated_length": 124.33333333333333, "entropy": 0.024375660344958305, "epoch": 0.3401442307692308, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.005761649925261736, "learning_rate": 6.599759615384615e-07, "loss": -0.0115, "num_tokens": 64404350.0, "reward": 0.719237486521403, "reward_std": 0.2615667333205541, "rewards/reward_fn/mean": 0.719237486521403, "rewards/reward_fn/std": 0.26156672338644665, "sampling/importance_sampling_ratio/max": 1.3692917823791504, "sampling/importance_sampling_ratio/mean": 0.33319635689258575, "sampling/importance_sampling_ratio/min": 0.00012865165285802505, "sampling/sampling_logp_difference/max": 3.744937221209208, "sampling/sampling_logp_difference/mean": 0.005222463825096686, "step": 2830, "step_time": 10.548396647255867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2693.0, "completions/mean_length": 1439.046875, "completions/mean_terminated_length": 688.5355529785156, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.02461354583501816, "epoch": 0.34134615384615385, "frac_reward_zero_std": 0.0, "grad_norm": 0.035875916481018066, "learning_rate": 6.587740384615384e-07, "loss": 0.005, "num_tokens": 64567385.0, "reward": 0.6952836215496063, "reward_std": 0.24831197410821915, "rewards/reward_fn/mean": 0.6952836215496063, "rewards/reward_fn/std": 0.24831197410821915, "sampling/importance_sampling_ratio/max": 2.590697407722473, "sampling/importance_sampling_ratio/mean": 0.35878223180770874, "sampling/importance_sampling_ratio/min": 7.324758882987226e-06, "sampling/sampling_logp_difference/max": 2.832168936729431, "sampling/sampling_logp_difference/mean": 0.005060225958004594, "step": 2840, "step_time": 7.1174449029378595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3854166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1280.6666666666667, "completions/mean_length": 1415.0104166666667, "completions/mean_terminated_length": 437.4675699869792, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.025270891934633256, "epoch": 0.3425480769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.007479832507669926, "learning_rate": 6.575721153846153e-07, "loss": 0.0016, "num_tokens": 64806498.0, "reward": 0.6728821992874146, "reward_std": 0.26547883450984955, "rewards/reward_fn/mean": 0.6728821992874146, "rewards/reward_fn/std": 0.26547881960868835, "sampling/importance_sampling_ratio/max": 1.5888238151868184, "sampling/importance_sampling_ratio/mean": 0.31050849954287213, "sampling/importance_sampling_ratio/min": 1.1158983625136898e-05, "sampling/sampling_logp_difference/max": 3.654442310333252, "sampling/sampling_logp_difference/mean": 0.005750813987106085, "step": 2850, "step_time": 10.463823213987052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1662.0, "completions/mean_length": 960.25, "completions/mean_terminated_length": 438.4768524169922, "completions/min_length": 103.5, "completions/min_terminated_length": 103.5, "entropy": 0.028149347752332687, "epoch": 0.34375, "frac_reward_zero_std": 0.0, "grad_norm": 0.006289340555667877, "learning_rate": 6.563701923076923e-07, "loss": 0.0032, "num_tokens": 64939802.0, "reward": 0.7745629847049713, "reward_std": 0.2281152456998825, "rewards/reward_fn/mean": 0.7745629847049713, "rewards/reward_fn/std": 0.2281152382493019, "sampling/importance_sampling_ratio/max": 1.1176949739456177, "sampling/importance_sampling_ratio/mean": 0.32767000049352646, "sampling/importance_sampling_ratio/min": 1.4705753073940286e-05, "sampling/sampling_logp_difference/max": 1.6435200572013855, "sampling/sampling_logp_difference/mean": 0.005995213985443115, "step": 2860, "step_time": 6.98508583381772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2289.0, "completions/mean_length": 1276.9895833333333, "completions/mean_terminated_length": 649.7674763997396, "completions/min_length": 165.33333333333334, "completions/min_terminated_length": 165.33333333333334, "entropy": 0.023813378438353537, "epoch": 0.3449519230769231, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.006391987670212984, "learning_rate": 6.551682692307691e-07, "loss": -0.0052, "num_tokens": 65180553.0, "reward": 0.6917014718055725, "reward_std": 0.29476818939050037, "rewards/reward_fn/mean": 0.6917014718055725, "rewards/reward_fn/std": 0.29476816455523175, "sampling/importance_sampling_ratio/max": 1.1695090134938557, "sampling/importance_sampling_ratio/mean": 0.3063147912422816, "sampling/importance_sampling_ratio/min": 7.894460465488843e-06, "sampling/sampling_logp_difference/max": 4.411165475845337, "sampling/sampling_logp_difference/mean": 0.005292367190122604, "step": 2870, "step_time": 10.481195047311484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2745.5, "completions/mean_length": 1337.328125, "completions/mean_terminated_length": 527.4695739746094, "completions/min_length": 87.5, "completions/min_terminated_length": 87.5, "entropy": 0.026441592909395694, "epoch": 0.34615384615384615, "frac_reward_zero_std": 0.0, "grad_norm": 0.007728982251137495, "learning_rate": 6.539663461538462e-07, "loss": -0.0053, "num_tokens": 65353574.0, "reward": 0.702043890953064, "reward_std": 0.27345073223114014, "rewards/reward_fn/mean": 0.702043890953064, "rewards/reward_fn/std": 0.27345071732997894, "sampling/importance_sampling_ratio/max": 1.526147484779358, "sampling/importance_sampling_ratio/mean": 0.3064705729484558, "sampling/importance_sampling_ratio/min": 3.914032276952639e-06, "sampling/sampling_logp_difference/max": 2.6018656492233276, "sampling/sampling_logp_difference/mean": 0.006068493938073516, "step": 2880, "step_time": 7.046322750672698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2708333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1778.6666666666667, "completions/mean_length": 1116.8125, "completions/mean_terminated_length": 423.20752970377606, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.023579319939017296, "epoch": 0.3473557692307692, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.011693094857037067, "learning_rate": 6.52764423076923e-07, "loss": 0.0056, "num_tokens": 65565956.0, "reward": 0.6449399590492249, "reward_std": 0.2600061098734538, "rewards/reward_fn/mean": 0.6449399590492249, "rewards/reward_fn/std": 0.2600061148405075, "sampling/importance_sampling_ratio/max": 1.154957930246989, "sampling/importance_sampling_ratio/mean": 0.3675974855820338, "sampling/importance_sampling_ratio/min": 0.0006945865960309069, "sampling/sampling_logp_difference/max": 3.6303712526957193, "sampling/sampling_logp_difference/mean": 0.0045376788669576245, "step": 2890, "step_time": 10.048805410601199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 3000.0, "completions/max_terminated_length": 1449.5, "completions/mean_length": 1762.5625, "completions/mean_terminated_length": 524.9870300292969, "completions/min_length": 173.5, "completions/min_terminated_length": 173.5, "entropy": 0.02184396293014288, "epoch": 0.3485576923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.002143600955605507, "learning_rate": 6.515625000000001e-07, "loss": -0.0046, "num_tokens": 65765264.0, "reward": 0.673580527305603, "reward_std": 0.2465047836303711, "rewards/reward_fn/mean": 0.673580527305603, "rewards/reward_fn/std": 0.2465047836303711, "sampling/importance_sampling_ratio/max": 1.0537468791007996, "sampling/importance_sampling_ratio/mean": 0.22259005904197693, "sampling/importance_sampling_ratio/min": 2.3181696102714255e-06, "sampling/sampling_logp_difference/max": 5.49142062664032, "sampling/sampling_logp_difference/mean": 0.00507719861343503, "step": 2900, "step_time": 7.228238931484521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3333333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1965.0, "completions/mean_length": 1358.9166666666667, "completions/mean_terminated_length": 554.7853495279948, "completions/min_length": 143.33333333333334, "completions/min_terminated_length": 143.33333333333334, "entropy": 0.021136753261089325, "epoch": 0.34975961538461536, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.0159227903932333, "learning_rate": 6.503605769230769e-07, "loss": -0.0013, "num_tokens": 65991568.0, "reward": 0.6382514635721842, "reward_std": 0.3126610517501831, "rewards/reward_fn/mean": 0.6382514635721842, "rewards/reward_fn/std": 0.3126610418160756, "sampling/importance_sampling_ratio/max": 1.1462655862172444, "sampling/importance_sampling_ratio/mean": 0.3566558261712392, "sampling/importance_sampling_ratio/min": 4.11288813969198e-05, "sampling/sampling_logp_difference/max": 2.4219170014063516, "sampling/sampling_logp_difference/mean": 0.004534831425795953, "step": 2910, "step_time": 10.334375515487045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2138.5, "completions/mean_length": 1637.3125, "completions/mean_terminated_length": 729.8493041992188, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.02673260625451803, "epoch": 0.35096153846153844, "frac_reward_zero_std": 0.0, "grad_norm": 0.004544360097497702, "learning_rate": 6.491586538461538e-07, "loss": 0.0027, "num_tokens": 66172564.0, "reward": 0.669700026512146, "reward_std": 0.27684013545513153, "rewards/reward_fn/mean": 0.669700026512146, "rewards/reward_fn/std": 0.2768401503562927, "sampling/importance_sampling_ratio/max": 1.2635446190834045, "sampling/importance_sampling_ratio/mean": 0.16515055298805237, "sampling/importance_sampling_ratio/min": 8.269995362297777e-06, "sampling/sampling_logp_difference/max": 4.098972678184509, "sampling/sampling_logp_difference/mean": 0.0058789062313735485, "step": 2920, "step_time": 7.301574968267232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2604166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1865.6666666666667, "completions/mean_length": 1175.3125, "completions/mean_terminated_length": 538.9326782226562, "completions/min_length": 141.33333333333334, "completions/min_terminated_length": 141.33333333333334, "entropy": 0.02621860187500715, "epoch": 0.35216346153846156, "frac_reward_zero_std": 0.0, "grad_norm": 0.008521990850567818, "learning_rate": 6.479567307692308e-07, "loss": -0.0038, "num_tokens": 66398394.0, "reward": 0.7557272911071777, "reward_std": 0.23599664866924286, "rewards/reward_fn/mean": 0.7557272911071777, "rewards/reward_fn/std": 0.2359966387351354, "sampling/importance_sampling_ratio/max": 1.8331321875254314, "sampling/importance_sampling_ratio/mean": 0.34085626900196075, "sampling/importance_sampling_ratio/min": 8.138395302618544e-05, "sampling/sampling_logp_difference/max": 2.502666195233663, "sampling/sampling_logp_difference/mean": 0.00503028929233551, "step": 2930, "step_time": 10.428045582026243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2326.5, "completions/mean_length": 1438.6875, "completions/mean_terminated_length": 639.8398284912109, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.024043994955718517, "epoch": 0.35336538461538464, "frac_reward_zero_std": 0.0, "grad_norm": 0.0018297630595043302, "learning_rate": 6.467548076923076e-07, "loss": -0.0031, "num_tokens": 66567950.0, "reward": 0.6398105025291443, "reward_std": 0.3290172517299652, "rewards/reward_fn/mean": 0.6398105025291443, "rewards/reward_fn/std": 0.3290172666311264, "sampling/importance_sampling_ratio/max": 1.4230411350727081, "sampling/importance_sampling_ratio/mean": 0.2526058405637741, "sampling/importance_sampling_ratio/min": 0.00011081814591307193, "sampling/sampling_logp_difference/max": 5.396750807762146, "sampling/sampling_logp_difference/mean": 0.005341190844774246, "step": 2940, "step_time": 7.243143723811954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3333333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1514.0, "completions/mean_length": 1371.2291666666667, "completions/mean_terminated_length": 549.2199300130209, "completions/min_length": 144.33333333333334, "completions/min_terminated_length": 144.33333333333334, "entropy": 0.024231213517487048, "epoch": 0.3545673076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.020785922184586525, "learning_rate": 6.455528846153846e-07, "loss": -0.0047, "num_tokens": 66821884.0, "reward": 0.652157723903656, "reward_std": 0.3085122307141622, "rewards/reward_fn/mean": 0.652157723903656, "rewards/reward_fn/std": 0.3085122307141622, "sampling/importance_sampling_ratio/max": 1.522047479947408, "sampling/importance_sampling_ratio/mean": 0.31376107533772785, "sampling/importance_sampling_ratio/min": 4.2005350375499496e-05, "sampling/sampling_logp_difference/max": 4.205764214197795, "sampling/sampling_logp_difference/mean": 0.005020102330793937, "step": 2950, "step_time": 10.442666785791516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1604.5, "completions/mean_length": 1398.25, "completions/mean_terminated_length": 548.3000030517578, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.01980907116085291, "epoch": 0.3557692307692308, "frac_reward_zero_std": 0.125, "grad_norm": 0.0023427624255418777, "learning_rate": 6.443509615384615e-07, "loss": 0.0054, "num_tokens": 66974716.0, "reward": 0.6270863562822342, "reward_std": 0.29034140706062317, "rewards/reward_fn/mean": 0.6270863562822342, "rewards/reward_fn/std": 0.29034140706062317, "sampling/importance_sampling_ratio/max": 1.550650715827942, "sampling/importance_sampling_ratio/mean": 0.3382941782474518, "sampling/importance_sampling_ratio/min": 0.0005002390826120973, "sampling/sampling_logp_difference/max": 4.211296916007996, "sampling/sampling_logp_difference/mean": 0.004067927715368569, "step": 2960, "step_time": 7.009641246404499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23958333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2709.6666666666665, "completions/mean_length": 1112.21875, "completions/mean_terminated_length": 531.973642985026, "completions/min_length": 161.33333333333334, "completions/min_terminated_length": 161.33333333333334, "entropy": 0.025837836042046546, "epoch": 0.35697115384615385, "frac_reward_zero_std": 0.0, "grad_norm": 0.03680206090211868, "learning_rate": 6.431490384615384e-07, "loss": 0.0081, "num_tokens": 67191217.0, "reward": 0.7405746579170227, "reward_std": 0.256101797024409, "rewards/reward_fn/mean": 0.7405746579170227, "rewards/reward_fn/std": 0.2561017870903015, "sampling/importance_sampling_ratio/max": 1.4813727140426636, "sampling/importance_sampling_ratio/mean": 0.365856612722079, "sampling/importance_sampling_ratio/min": 3.274912554237138e-05, "sampling/sampling_logp_difference/max": 3.4103230635325112, "sampling/sampling_logp_difference/mean": 0.005764266941696405, "step": 2970, "step_time": 10.380710415635258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1237.0, "completions/mean_length": 1084.75, "completions/mean_terminated_length": 323.6876220703125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.02251967675983906, "epoch": 0.3581730769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.008451610803604126, "learning_rate": 6.419471153846153e-07, "loss": -0.0033, "num_tokens": 67334993.0, "reward": 0.7365331947803497, "reward_std": 0.24342426657676697, "rewards/reward_fn/mean": 0.7365331947803497, "rewards/reward_fn/std": 0.24342425167560577, "sampling/importance_sampling_ratio/max": 1.2367245554924011, "sampling/importance_sampling_ratio/mean": 0.4230058193206787, "sampling/importance_sampling_ratio/min": 7.729569460934727e-05, "sampling/sampling_logp_difference/max": 2.1217567324638367, "sampling/sampling_logp_difference/mean": 0.005039701471105218, "step": 2980, "step_time": 7.093800844717771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2152.3333333333335, "completions/mean_length": 1579.1979166666667, "completions/mean_terminated_length": 763.3961181640625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.021907702088356018, "epoch": 0.359375, "frac_reward_zero_std": 0.0, "grad_norm": 0.028141945600509644, "learning_rate": 6.407451923076923e-07, "loss": -0.0014, "num_tokens": 67625292.0, "reward": 0.6943406263987223, "reward_std": 0.28487402697404224, "rewards/reward_fn/mean": 0.6943406263987223, "rewards/reward_fn/std": 0.28487400710582733, "sampling/importance_sampling_ratio/max": 1.3406153122584026, "sampling/importance_sampling_ratio/mean": 0.2690477470556895, "sampling/importance_sampling_ratio/min": 8.208502765683079e-05, "sampling/sampling_logp_difference/max": 2.896640499432882, "sampling/sampling_logp_difference/mean": 0.004800545672575633, "step": 2990, "step_time": 10.63472943753004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2782.5, "completions/max_terminated_length": 2620.5, "completions/mean_length": 1054.8125, "completions/mean_terminated_length": 679.1875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.021935983374714852, "epoch": 0.3605769230769231, "frac_reward_zero_std": 0.125, "grad_norm": 0.006156194489449263, "learning_rate": 6.395432692307692e-07, "loss": 0.0035, "num_tokens": 67761896.0, "reward": 0.6699436008930206, "reward_std": 0.22686780989170074, "rewards/reward_fn/mean": 0.6699436008930206, "rewards/reward_fn/std": 0.22686781734228134, "sampling/importance_sampling_ratio/max": 1.500333845615387, "sampling/importance_sampling_ratio/mean": 0.3913382887840271, "sampling/importance_sampling_ratio/min": 8.264028167559445e-06, "sampling/sampling_logp_difference/max": 6.750271320343018, "sampling/sampling_logp_difference/mean": 0.004568589734844863, "step": 3000, "step_time": 6.52237644745037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3645833333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2242.3333333333335, "completions/mean_length": 1514.1666666666667, "completions/mean_terminated_length": 681.1017049153646, "completions/min_length": 173.66666666666666, "completions/min_terminated_length": 173.66666666666666, "entropy": 0.020713152550160884, "epoch": 0.36177884615384615, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.005513470619916916, "learning_rate": 6.383413461538461e-07, "loss": 0.0038, "num_tokens": 68009216.0, "reward": 0.6710994442303976, "reward_std": 0.28816449642181396, "rewards/reward_fn/mean": 0.6710994442303976, "rewards/reward_fn/std": 0.28816449642181396, "sampling/importance_sampling_ratio/max": 1.3968857924143474, "sampling/importance_sampling_ratio/mean": 0.29969679315884906, "sampling/importance_sampling_ratio/min": 0.00015881623160870126, "sampling/sampling_logp_difference/max": 3.8023016452789307, "sampling/sampling_logp_difference/mean": 0.004565134023626645, "step": 3010, "step_time": 10.301725914608687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1771.0, "completions/mean_length": 1725.5625, "completions/mean_terminated_length": 664.5833435058594, "completions/min_length": 213.5, "completions/min_terminated_length": 213.5, "entropy": 0.023889271728694438, "epoch": 0.3629807692307692, "frac_reward_zero_std": 0.0, "grad_norm": 0.028035586699843407, "learning_rate": 6.371394230769231e-07, "loss": 0.0056, "num_tokens": 68196004.0, "reward": 0.6496320068836212, "reward_std": 0.2575787231326103, "rewards/reward_fn/mean": 0.6496320068836212, "rewards/reward_fn/std": 0.25757869333028793, "sampling/importance_sampling_ratio/max": 1.0891358256340027, "sampling/importance_sampling_ratio/mean": 0.17336281388998032, "sampling/importance_sampling_ratio/min": 7.302998028535512e-06, "sampling/sampling_logp_difference/max": 2.7808388471603394, "sampling/sampling_logp_difference/mean": 0.005540127167478204, "step": 3020, "step_time": 7.238953069318086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3333333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2369.6666666666665, "completions/mean_length": 1395.8229166666667, "completions/mean_terminated_length": 624.3418782552084, "completions/min_length": 173.33333333333334, "completions/min_terminated_length": 173.33333333333334, "entropy": 0.02443194277584553, "epoch": 0.3641826923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.020531782880425453, "learning_rate": 6.359375e-07, "loss": -0.0008, "num_tokens": 68446571.0, "reward": 0.7064047058423361, "reward_std": 0.2532275418440501, "rewards/reward_fn/mean": 0.7064047058423361, "rewards/reward_fn/std": 0.25322753687699634, "sampling/importance_sampling_ratio/max": 1.1966773668924968, "sampling/importance_sampling_ratio/mean": 0.2506915181875229, "sampling/importance_sampling_ratio/min": 0.00015090173013732056, "sampling/sampling_logp_difference/max": 3.2203840812047324, "sampling/sampling_logp_difference/mean": 0.005708196510871251, "step": 3030, "step_time": 10.34355279840529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1574.0, "completions/mean_length": 1701.34375, "completions/mean_terminated_length": 667.4548645019531, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.025492165610194205, "epoch": 0.36538461538461536, "frac_reward_zero_std": 0.0, "grad_norm": 0.0035250582732260227, "learning_rate": 6.347355769230768e-07, "loss": -0.0031, "num_tokens": 68625593.0, "reward": 0.6481672525405884, "reward_std": 0.28108350932598114, "rewards/reward_fn/mean": 0.6481672525405884, "rewards/reward_fn/std": 0.28108353912830353, "sampling/importance_sampling_ratio/max": 1.2126761972904205, "sampling/importance_sampling_ratio/mean": 0.17512930184602737, "sampling/importance_sampling_ratio/min": 3.208899170203949e-05, "sampling/sampling_logp_difference/max": 2.861757457256317, "sampling/sampling_logp_difference/mean": 0.0049594196025282145, "step": 3040, "step_time": 7.140995558537543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2140.6666666666665, "completions/mean_length": 998.59375, "completions/mean_terminated_length": 444.31871541341144, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.02243505250662565, "epoch": 0.36658653846153844, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.0028140845242887735, "learning_rate": 6.335336538461539e-07, "loss": 0.0042, "num_tokens": 68846482.0, "reward": 0.7939872145652771, "reward_std": 0.22685289879639944, "rewards/reward_fn/mean": 0.7939872145652771, "rewards/reward_fn/std": 0.2268528789281845, "sampling/importance_sampling_ratio/max": 1.4230415026346843, "sampling/importance_sampling_ratio/mean": 0.44450585047403973, "sampling/importance_sampling_ratio/min": 0.00016543706099862257, "sampling/sampling_logp_difference/max": 6.04352339108785, "sampling/sampling_logp_difference/mean": 0.004573013788710038, "step": 3050, "step_time": 10.33615669719875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1166.5, "completions/mean_length": 1644.28125, "completions/mean_terminated_length": 374.87779235839844, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.021112394332885743, "epoch": 0.36778846153846156, "frac_reward_zero_std": 0.0, "grad_norm": 0.0025510413106530905, "learning_rate": 6.323317307692307e-07, "loss": 0.0019, "num_tokens": 69027436.0, "reward": 0.6195077300071716, "reward_std": 0.29553721845149994, "rewards/reward_fn/mean": 0.6195077300071716, "rewards/reward_fn/std": 0.29553720355033875, "sampling/importance_sampling_ratio/max": 1.9045431017875671, "sampling/importance_sampling_ratio/mean": 0.3043844401836395, "sampling/importance_sampling_ratio/min": 8.06992011348484e-06, "sampling/sampling_logp_difference/max": 3.131243944168091, "sampling/sampling_logp_difference/mean": 0.005196343641728163, "step": 3060, "step_time": 7.181386069022119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1722.3333333333333, "completions/mean_length": 1246.3125, "completions/mean_terminated_length": 452.1782735188802, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.023517537862062454, "epoch": 0.36899038461538464, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.007885362952947617, "learning_rate": 6.311298076923077e-07, "loss": -0.0032, "num_tokens": 69246714.0, "reward": 0.6933804551760355, "reward_std": 0.26283960044384, "rewards/reward_fn/mean": 0.6933804551760355, "rewards/reward_fn/std": 0.26283960044384, "sampling/importance_sampling_ratio/max": 1.6651012301445007, "sampling/importance_sampling_ratio/mean": 0.3519500195980072, "sampling/importance_sampling_ratio/min": 1.5718702949622337e-05, "sampling/sampling_logp_difference/max": 2.850069999694824, "sampling/sampling_logp_difference/mean": 0.005274124443531036, "step": 3070, "step_time": 10.428635131008923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 2402.0, "completions/mean_length": 1341.359375, "completions/mean_terminated_length": 774.2536010742188, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.024085666984319687, "epoch": 0.3701923076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.003475816920399666, "learning_rate": 6.299278846153846e-07, "loss": -0.0043, "num_tokens": 69411665.0, "reward": 0.7101583480834961, "reward_std": 0.29760176688432693, "rewards/reward_fn/mean": 0.7101583480834961, "rewards/reward_fn/std": 0.29760176688432693, "sampling/importance_sampling_ratio/max": 1.8650830388069153, "sampling/importance_sampling_ratio/mean": 0.25640759617090225, "sampling/importance_sampling_ratio/min": 0.0001539538870929391, "sampling/sampling_logp_difference/max": 2.837941586971283, "sampling/sampling_logp_difference/mean": 0.0053916811011731625, "step": 3080, "step_time": 7.087378838937729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 2175.3333333333335, "completions/mean_length": 1217.5833333333333, "completions/mean_terminated_length": 603.979990641276, "completions/min_length": 148.66666666666666, "completions/min_terminated_length": 148.66666666666666, "entropy": 0.022752970270812512, "epoch": 0.3713942307692308, "frac_reward_zero_std": 0.0, "grad_norm": 0.003993566147983074, "learning_rate": 6.287259615384614e-07, "loss": -0.0011, "num_tokens": 69641369.0, "reward": 0.7717539072036743, "reward_std": 0.22359096507231394, "rewards/reward_fn/mean": 0.7717539072036743, "rewards/reward_fn/std": 0.22359096010526022, "sampling/importance_sampling_ratio/max": 1.4935739835103352, "sampling/importance_sampling_ratio/mean": 0.36464767654736835, "sampling/importance_sampling_ratio/min": 7.320867734961212e-05, "sampling/sampling_logp_difference/max": 2.189392844835917, "sampling/sampling_logp_difference/mean": 0.00506069449086984, "step": 3090, "step_time": 10.181432674545794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 2522.5, "completions/mean_length": 1173.90625, "completions/mean_terminated_length": 587.5785827636719, "completions/min_length": 138.5, "completions/min_terminated_length": 138.5, "entropy": 0.027482195943593978, "epoch": 0.37259615384615385, "frac_reward_zero_std": 0.0, "grad_norm": 0.012232874520123005, "learning_rate": 6.275240384615385e-07, "loss": -0.0054, "num_tokens": 69791427.0, "reward": 0.7725109755992889, "reward_std": 0.23830509930849075, "rewards/reward_fn/mean": 0.7725109755992889, "rewards/reward_fn/std": 0.23830511420965195, "sampling/importance_sampling_ratio/max": 1.4012246131896973, "sampling/importance_sampling_ratio/mean": 0.36566951870918274, "sampling/importance_sampling_ratio/min": 5.957988463478614e-05, "sampling/sampling_logp_difference/max": 1.7543874979019165, "sampling/sampling_logp_difference/mean": 0.006174272391945124, "step": 3100, "step_time": 7.117768373619765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2057.3333333333335, "completions/mean_length": 1656.65625, "completions/mean_terminated_length": 644.2201131184896, "completions/min_length": 198.33333333333334, "completions/min_terminated_length": 198.33333333333334, "entropy": 0.02345912279561162, "epoch": 0.3737980769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.005087500903755426, "learning_rate": 6.263221153846153e-07, "loss": 0.0063, "num_tokens": 70075410.0, "reward": 0.6330714225769043, "reward_std": 0.30039971073468524, "rewards/reward_fn/mean": 0.6330714225769043, "rewards/reward_fn/std": 0.30039971073468524, "sampling/importance_sampling_ratio/max": 2.0206180016199746, "sampling/importance_sampling_ratio/mean": 0.2747419277826945, "sampling/importance_sampling_ratio/min": 1.548034144131331e-06, "sampling/sampling_logp_difference/max": 6.240050872166951, "sampling/sampling_logp_difference/mean": 0.004924255112806956, "step": 3110, "step_time": 10.516085989773273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1540.0, "completions/mean_length": 929.125, "completions/mean_terminated_length": 402.61802673339844, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.02326902747154236, "epoch": 0.375, "frac_reward_zero_std": 0.0, "grad_norm": 0.003840809455141425, "learning_rate": 6.251201923076923e-07, "loss": -0.003, "num_tokens": 70206826.0, "reward": 0.7745409607887268, "reward_std": 0.2394006922841072, "rewards/reward_fn/mean": 0.7745409607887268, "rewards/reward_fn/std": 0.2394007071852684, "sampling/importance_sampling_ratio/max": 1.302371323108673, "sampling/importance_sampling_ratio/mean": 0.49422410130500793, "sampling/importance_sampling_ratio/min": 0.00020685233175754547, "sampling/sampling_logp_difference/max": 2.5011112689971924, "sampling/sampling_logp_difference/mean": 0.004646181594580412, "step": 3120, "step_time": 6.84160819798708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2403.6666666666665, "completions/mean_length": 924.2708333333334, "completions/mean_terminated_length": 517.433095296224, "completions/min_length": 109.66666666666667, "completions/min_terminated_length": 109.66666666666667, "entropy": 0.02387548368424177, "epoch": 0.3762019230769231, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.005209248512983322, "learning_rate": 6.239182692307692e-07, "loss": -0.0083, "num_tokens": 70403124.0, "reward": 0.7509336272875468, "reward_std": 0.2238088051478068, "rewards/reward_fn/mean": 0.7509336272875468, "rewards/reward_fn/std": 0.22380880018075308, "sampling/importance_sampling_ratio/max": 1.7557673851648967, "sampling/importance_sampling_ratio/mean": 0.40156761805216473, "sampling/importance_sampling_ratio/min": 0.002102234788859884, "sampling/sampling_logp_difference/max": 2.3269541263580322, "sampling/sampling_logp_difference/mean": 0.005116572293142478, "step": 3130, "step_time": 9.959940402489156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2308.5, "completions/mean_length": 971.46875, "completions/mean_terminated_length": 503.34617614746094, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.02250025849789381, "epoch": 0.37740384615384615, "frac_reward_zero_std": 0.125, "grad_norm": 0.04091005027294159, "learning_rate": 6.227163461538462e-07, "loss": -0.0131, "num_tokens": 70524178.0, "reward": 0.7841638326644897, "reward_std": 0.23956956714391708, "rewards/reward_fn/mean": 0.7841638326644897, "rewards/reward_fn/std": 0.23956957459449768, "sampling/importance_sampling_ratio/max": 2.1148310899734497, "sampling/importance_sampling_ratio/mean": 0.534813791513443, "sampling/importance_sampling_ratio/min": 0.00030954073736211285, "sampling/sampling_logp_difference/max": 2.7452861070632935, "sampling/sampling_logp_difference/mean": 0.004469587933272123, "step": 3140, "step_time": 6.900625597499311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3229166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2598.6666666666665, "completions/mean_length": 1472.8854166666667, "completions/mean_terminated_length": 747.9812418619791, "completions/min_length": 173.66666666666666, "completions/min_terminated_length": 173.66666666666666, "entropy": 0.023403120413422584, "epoch": 0.3786057692307692, "frac_reward_zero_std": 0.0, "grad_norm": 0.006825059652328491, "learning_rate": 6.21514423076923e-07, "loss": -0.0042, "num_tokens": 70762375.0, "reward": 0.6158232490221659, "reward_std": 0.3144171138604482, "rewards/reward_fn/mean": 0.6158232490221659, "rewards/reward_fn/std": 0.31441710392634076, "sampling/importance_sampling_ratio/max": 2.2232763369878135, "sampling/importance_sampling_ratio/mean": 0.33772164583206177, "sampling/importance_sampling_ratio/min": 1.2113751987878155e-05, "sampling/sampling_logp_difference/max": 2.8147602478663125, "sampling/sampling_logp_difference/mean": 0.004998668096959591, "step": 3150, "step_time": 10.373091126792133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 1524.5, "completions/mean_length": 1085.25, "completions/mean_terminated_length": 439.4510498046875, "completions/min_length": 194.5, "completions/min_terminated_length": 194.5, "entropy": 0.024088918603956698, "epoch": 0.3798076923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.01166957151144743, "learning_rate": 6.203125e-07, "loss": -0.0061, "num_tokens": 70908351.0, "reward": 0.7192789316177368, "reward_std": 0.26222922652959824, "rewards/reward_fn/mean": 0.7192789316177368, "rewards/reward_fn/std": 0.26222922652959824, "sampling/importance_sampling_ratio/max": 1.1536340117454529, "sampling/importance_sampling_ratio/mean": 0.3759918510913849, "sampling/importance_sampling_ratio/min": 0.00048190249890467385, "sampling/sampling_logp_difference/max": 3.4000518321990967, "sampling/sampling_logp_difference/mean": 0.004690395784564316, "step": 3160, "step_time": 6.951845482550562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1835.3333333333333, "completions/mean_length": 1223.59375, "completions/mean_terminated_length": 522.0562032063802, "completions/min_length": 147.66666666666666, "completions/min_terminated_length": 147.66666666666666, "entropy": 0.02728066425770521, "epoch": 0.38100961538461536, "frac_reward_zero_std": 0.0, "grad_norm": 0.008466286584734917, "learning_rate": 6.191105769230769e-07, "loss": -0.0034, "num_tokens": 71139088.0, "reward": 0.7189341187477112, "reward_std": 0.2740337649981181, "rewards/reward_fn/mean": 0.7189341187477112, "rewards/reward_fn/std": 0.2740337649981181, "sampling/importance_sampling_ratio/max": 1.3166531324386597, "sampling/importance_sampling_ratio/mean": 0.33803559343020123, "sampling/importance_sampling_ratio/min": 0.00022033012252601716, "sampling/sampling_logp_difference/max": 1.9375563859939575, "sampling/sampling_logp_difference/mean": 0.006086159031838179, "step": 3170, "step_time": 10.288920569140464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 983.5, "completions/mean_terminated_length": 511.76463317871094, "completions/min_length": 121.5, "completions/min_terminated_length": 121.5, "entropy": 0.020701412856578828, "epoch": 0.38221153846153844, "frac_reward_zero_std": 0.125, "grad_norm": 0.03458762541413307, "learning_rate": 6.179086538461538e-07, "loss": -0.0023, "num_tokens": 71261584.0, "reward": 0.555366113781929, "reward_std": 0.37044259905815125, "rewards/reward_fn/mean": 0.555366113781929, "rewards/reward_fn/std": 0.37044258415699005, "sampling/importance_sampling_ratio/max": 1.0851582288742065, "sampling/importance_sampling_ratio/mean": 0.44565530121326447, "sampling/importance_sampling_ratio/min": 2.3343252905760892e-05, "sampling/sampling_logp_difference/max": 1.9883873462677002, "sampling/sampling_logp_difference/mean": 0.004114025272428989, "step": 3180, "step_time": 6.636102172359824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2604166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1791.3333333333333, "completions/mean_length": 1088.0833333333333, "completions/mean_terminated_length": 411.68797810872394, "completions/min_length": 125.33333333333333, "completions/min_terminated_length": 125.33333333333333, "entropy": 0.02518922369927168, "epoch": 0.38341346153846156, "frac_reward_zero_std": 0.0, "grad_norm": 0.010143889114260674, "learning_rate": 6.167067307692308e-07, "loss": -0.0051, "num_tokens": 71477896.0, "reward": 0.6878215670585632, "reward_std": 0.26933689912160236, "rewards/reward_fn/mean": 0.6878215670585632, "rewards/reward_fn/std": 0.26933688918749493, "sampling/importance_sampling_ratio/max": 1.6348201831181843, "sampling/importance_sampling_ratio/mean": 0.37670350074768066, "sampling/importance_sampling_ratio/min": 6.476825516680644e-05, "sampling/sampling_logp_difference/max": 3.0709586143493652, "sampling/sampling_logp_difference/mean": 0.005295285489410162, "step": 3190, "step_time": 10.210595141351224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1923.0, "completions/mean_length": 1272.265625, "completions/mean_terminated_length": 442.1710510253906, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.019577998109161852, "epoch": 0.38461538461538464, "frac_reward_zero_std": 0.0, "grad_norm": 0.01075992826372385, "learning_rate": 6.155048076923076e-07, "loss": -0.0126, "num_tokens": 71638145.0, "reward": 0.6181862056255341, "reward_std": 0.3550487458705902, "rewards/reward_fn/mean": 0.6181862056255341, "rewards/reward_fn/std": 0.355048730969429, "sampling/importance_sampling_ratio/max": 1.4626011848449707, "sampling/importance_sampling_ratio/mean": 0.4293510168790817, "sampling/importance_sampling_ratio/min": 9.757138104760088e-05, "sampling/sampling_logp_difference/max": 3.0298818349838257, "sampling/sampling_logp_difference/mean": 0.004207453224807978, "step": 3200, "step_time": 7.317819740250707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 1919.0, "completions/mean_length": 1102.0520833333333, "completions/mean_terminated_length": 476.7788899739583, "completions/min_length": 133.33333333333334, "completions/min_terminated_length": 133.33333333333334, "entropy": 0.023420874401926993, "epoch": 0.3858173076923077, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.005184351000934839, "learning_rate": 6.143028846153845e-07, "loss": -0.0076, "num_tokens": 71862086.0, "reward": 0.6965871055920919, "reward_std": 0.27214889228343964, "rewards/reward_fn/mean": 0.6965871055920919, "rewards/reward_fn/std": 0.27214889228343964, "sampling/importance_sampling_ratio/max": 1.2846963206926982, "sampling/importance_sampling_ratio/mean": 0.39174341162045795, "sampling/importance_sampling_ratio/min": 1.3321117118418139e-05, "sampling/sampling_logp_difference/max": 1.8353089491526287, "sampling/sampling_logp_difference/mean": 0.005076068143049876, "step": 3210, "step_time": 9.978717252518981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2679.0, "completions/mean_length": 1952.25, "completions/mean_terminated_length": 1069.7582702636719, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.024180729500949384, "epoch": 0.3870192307692308, "frac_reward_zero_std": 0.0, "grad_norm": 0.004811166785657406, "learning_rate": 6.131009615384615e-07, "loss": 0.0028, "num_tokens": 72066022.0, "reward": 0.6726667881011963, "reward_std": 0.26679620146751404, "rewards/reward_fn/mean": 0.6726667881011963, "rewards/reward_fn/std": 0.26679619401693344, "sampling/importance_sampling_ratio/max": 1.4412954449653625, "sampling/importance_sampling_ratio/mean": 0.17980124801397324, "sampling/importance_sampling_ratio/min": 2.507147928554332e-05, "sampling/sampling_logp_difference/max": 2.4584391713142395, "sampling/sampling_logp_difference/mean": 0.004977440694347024, "step": 3220, "step_time": 7.15764893470332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2916666666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1719.0, "completions/mean_length": 1207.8854166666667, "completions/mean_terminated_length": 476.7964274088542, "completions/min_length": 121.66666666666667, "completions/min_terminated_length": 121.66666666666667, "entropy": 0.023805302940309046, "epoch": 0.38822115384615385, "frac_reward_zero_std": 0.0, "grad_norm": 0.027513137087225914, "learning_rate": 6.118990384615384e-07, "loss": -0.006, "num_tokens": 72293275.0, "reward": 0.7121252417564392, "reward_std": 0.28546908497810364, "rewards/reward_fn/mean": 0.7121252417564392, "rewards/reward_fn/std": 0.28546907504399616, "sampling/importance_sampling_ratio/max": 1.4522273540496826, "sampling/importance_sampling_ratio/mean": 0.3626905679702759, "sampling/importance_sampling_ratio/min": 5.971249798146042e-05, "sampling/sampling_logp_difference/max": 5.019429524739583, "sampling/sampling_logp_difference/mean": 0.0053244892818232374, "step": 3230, "step_time": 10.334964302461595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1779.5, "completions/mean_length": 710.109375, "completions/mean_terminated_length": 383.599609375, "completions/min_length": 110.5, "completions/min_terminated_length": 110.5, "entropy": 0.02309694644063711, "epoch": 0.3894230769230769, "frac_reward_zero_std": 0.125, "grad_norm": 0.039561230689287186, "learning_rate": 6.106971153846154e-07, "loss": -0.0064, "num_tokens": 72413978.0, "reward": 0.7191550135612488, "reward_std": 0.3063860833644867, "rewards/reward_fn/mean": 0.7191550135612488, "rewards/reward_fn/std": 0.3063860759139061, "sampling/importance_sampling_ratio/max": 1.936397135257721, "sampling/importance_sampling_ratio/mean": 0.559327244758606, "sampling/importance_sampling_ratio/min": 5.951116872893181e-05, "sampling/sampling_logp_difference/max": 2.7162503004074097, "sampling/sampling_logp_difference/mean": 0.005042055854573846, "step": 3240, "step_time": 6.897413563821464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20833333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1912.6666666666667, "completions/mean_length": 989.1041666666666, "completions/mean_terminated_length": 492.69675699869794, "completions/min_length": 141.33333333333334, "completions/min_terminated_length": 141.33333333333334, "entropy": 0.022491187788546085, "epoch": 0.390625, "frac_reward_zero_std": 0.0, "grad_norm": 0.015202141366899014, "learning_rate": 6.094951923076923e-07, "loss": -0.012, "num_tokens": 72611340.0, "reward": 0.7048063278198242, "reward_std": 0.3003087391455968, "rewards/reward_fn/mean": 0.7048063278198242, "rewards/reward_fn/std": 0.3003087192773819, "sampling/importance_sampling_ratio/max": 1.6570007801055908, "sampling/importance_sampling_ratio/mean": 0.439024751385053, "sampling/importance_sampling_ratio/min": 0.000124565341184694, "sampling/sampling_logp_difference/max": 3.0353211959203086, "sampling/sampling_logp_difference/mean": 0.005344759362439315, "step": 3250, "step_time": 9.928270268719643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2419.0, "completions/mean_length": 655.078125, "completions/mean_terminated_length": 456.8408203125, "completions/min_length": 93.5, "completions/min_terminated_length": 93.5, "entropy": 0.0268368499353528, "epoch": 0.3918269230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.04565397650003433, "learning_rate": 6.082932692307692e-07, "loss": 0.0021, "num_tokens": 72706313.0, "reward": 0.8344680070877075, "reward_std": 0.16805239021778107, "rewards/reward_fn/mean": 0.8344680070877075, "rewards/reward_fn/std": 0.16805238276720047, "sampling/importance_sampling_ratio/max": 2.0348532795906067, "sampling/importance_sampling_ratio/mean": 0.5363818109035492, "sampling/importance_sampling_ratio/min": 0.00025148088207060937, "sampling/sampling_logp_difference/max": 1.9113219380378723, "sampling/sampling_logp_difference/mean": 0.005941193550825119, "step": 3260, "step_time": 6.448729178030044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3541666666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2119.6666666666665, "completions/mean_length": 1446.6979166666667, "completions/mean_terminated_length": 610.4071858723959, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.02155132740736008, "epoch": 0.39302884615384615, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.022562379017472267, "learning_rate": 6.070913461538462e-07, "loss": -0.0093, "num_tokens": 72989292.0, "reward": 0.6040774981180826, "reward_std": 0.31436235706011456, "rewards/reward_fn/mean": 0.6040774981180826, "rewards/reward_fn/std": 0.3143623371918996, "sampling/importance_sampling_ratio/max": 1.9877144893010457, "sampling/importance_sampling_ratio/mean": 0.3495716055234273, "sampling/importance_sampling_ratio/min": 0.0001987565128622748, "sampling/sampling_logp_difference/max": 3.420247793197632, "sampling/sampling_logp_difference/mean": 0.00530534191057086, "step": 3270, "step_time": 11.414463269710541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2154.5, "completions/mean_length": 1247.171875, "completions/mean_terminated_length": 577.8423156738281, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.024909759499132635, "epoch": 0.3942307692307692, "frac_reward_zero_std": 0.0, "grad_norm": 0.009168716147542, "learning_rate": 6.05889423076923e-07, "loss": -0.0065, "num_tokens": 73134407.0, "reward": 0.7074766159057617, "reward_std": 0.2745952010154724, "rewards/reward_fn/mean": 0.7074766159057617, "rewards/reward_fn/std": 0.2745952159166336, "sampling/importance_sampling_ratio/max": 1.4629302024841309, "sampling/importance_sampling_ratio/mean": 0.335781529545784, "sampling/importance_sampling_ratio/min": 0.00039979227221920155, "sampling/sampling_logp_difference/max": 1.5386531352996826, "sampling/sampling_logp_difference/mean": 0.005170842166990042, "step": 3280, "step_time": 7.181604312360287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20833333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2180.3333333333335, "completions/mean_length": 1056.2395833333333, "completions/mean_terminated_length": 589.1602986653646, "completions/min_length": 145.33333333333334, "completions/min_terminated_length": 145.33333333333334, "entropy": 0.023685162514448167, "epoch": 0.3954326923076923, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.014270702376961708, "learning_rate": 6.046875e-07, "loss": -0.0001, "num_tokens": 73331734.0, "reward": 0.7904730836550394, "reward_std": 0.2014190802971522, "rewards/reward_fn/mean": 0.7904730836550394, "rewards/reward_fn/std": 0.201419065395991, "sampling/importance_sampling_ratio/max": 1.3105170726776123, "sampling/importance_sampling_ratio/mean": 0.4369664291540782, "sampling/importance_sampling_ratio/min": 0.0003508894111898068, "sampling/sampling_logp_difference/max": 1.7872824668884277, "sampling/sampling_logp_difference/mean": 0.004899086120227973, "step": 3290, "step_time": 10.029705266002566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 2425.5, "completions/mean_length": 1134.140625, "completions/mean_terminated_length": 525.5476226806641, "completions/min_length": 136.5, "completions/min_terminated_length": 136.5, "entropy": 0.019576493836939334, "epoch": 0.39663461538461536, "frac_reward_zero_std": 0.25, "grad_norm": 0.0018278742209076881, "learning_rate": 6.034855769230769e-07, "loss": -0.0014, "num_tokens": 73477919.0, "reward": 0.5257045030593872, "reward_std": 0.3787413537502289, "rewards/reward_fn/mean": 0.5257045030593872, "rewards/reward_fn/std": 0.3787413686513901, "sampling/importance_sampling_ratio/max": 1.3678673803806305, "sampling/importance_sampling_ratio/mean": 0.44077539443969727, "sampling/importance_sampling_ratio/min": 3.4658551840038854e-05, "sampling/sampling_logp_difference/max": 1.6687725186347961, "sampling/sampling_logp_difference/mean": 0.004159011645242572, "step": 3300, "step_time": 7.178349936939776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 2124.6666666666665, "completions/mean_length": 1191.7604166666667, "completions/mean_terminated_length": 593.2217814127604, "completions/min_length": 131.66666666666666, "completions/min_terminated_length": 131.66666666666666, "entropy": 0.022496035881340505, "epoch": 0.39783653846153844, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.00835899356752634, "learning_rate": 6.022836538461539e-07, "loss": -0.0058, "num_tokens": 73708000.0, "reward": 0.7627103726069132, "reward_std": 0.2326513727506002, "rewards/reward_fn/mean": 0.7627103726069132, "rewards/reward_fn/std": 0.2326513727506002, "sampling/importance_sampling_ratio/max": 1.3869497974713643, "sampling/importance_sampling_ratio/mean": 0.3754725654919942, "sampling/importance_sampling_ratio/min": 6.018027488607913e-05, "sampling/sampling_logp_difference/max": 3.809593439102173, "sampling/sampling_logp_difference/mean": 0.005176423117518425, "step": 3310, "step_time": 10.041337585356086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1137.5, "completions/mean_length": 1227.359375, "completions/mean_terminated_length": 429.5, "completions/min_length": 176.5, "completions/min_terminated_length": 176.5, "entropy": 0.020709970127791168, "epoch": 0.39903846153846156, "frac_reward_zero_std": 0.125, "grad_norm": 0.012180306017398834, "learning_rate": 6.010817307692307e-07, "loss": 0.0148, "num_tokens": 73853839.0, "reward": 0.5285801440477371, "reward_std": 0.3374328911304474, "rewards/reward_fn/mean": 0.5285801440477371, "rewards/reward_fn/std": 0.3374328911304474, "sampling/importance_sampling_ratio/max": 1.0690841674804688, "sampling/importance_sampling_ratio/mean": 0.402814045548439, "sampling/importance_sampling_ratio/min": 1.8420410924591124e-05, "sampling/sampling_logp_difference/max": 1.8673403859138489, "sampling/sampling_logp_difference/mean": 0.0038793663261458278, "step": 3320, "step_time": 7.145859755109996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4270833333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2580.0, "completions/mean_length": 1662.1875, "completions/mean_terminated_length": 679.5505574544271, "completions/min_length": 171.66666666666666, "completions/min_terminated_length": 171.66666666666666, "entropy": 0.023307993169873954, "epoch": 0.40024038461538464, "frac_reward_zero_std": 0.0, "grad_norm": 0.0013347248313948512, "learning_rate": 5.998798076923076e-07, "loss": -0.0016, "num_tokens": 74131145.0, "reward": 0.6570606629053751, "reward_std": 0.27711645762125653, "rewards/reward_fn/mean": 0.6570606629053751, "rewards/reward_fn/std": 0.27711646755536395, "sampling/importance_sampling_ratio/max": 1.3859057029088337, "sampling/importance_sampling_ratio/mean": 0.20814481874306998, "sampling/importance_sampling_ratio/min": 4.712784486097613e-05, "sampling/sampling_logp_difference/max": 3.012236555417379, "sampling/sampling_logp_difference/mean": 0.004976691212505102, "step": 3330, "step_time": 10.556273089721799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2508.0, "completions/mean_length": 1433.546875, "completions/mean_terminated_length": 610.284912109375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.023911891132593156, "epoch": 0.4014423076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.008500785566866398, "learning_rate": 5.986778846153846e-07, "loss": 0.0103, "num_tokens": 74294372.0, "reward": 0.6481566727161407, "reward_std": 0.2961992919445038, "rewards/reward_fn/mean": 0.6481566727161407, "rewards/reward_fn/std": 0.296199306845665, "sampling/importance_sampling_ratio/max": 1.1356995701789856, "sampling/importance_sampling_ratio/mean": 0.2996499538421631, "sampling/importance_sampling_ratio/min": 7.95454946000973e-06, "sampling/sampling_logp_difference/max": 3.3233344554901123, "sampling/sampling_logp_difference/mean": 0.005443903151899576, "step": 3340, "step_time": 7.0349366588518025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 1586.3333333333333, "completions/mean_length": 1068.34375, "completions/mean_terminated_length": 431.4220886230469, "completions/min_length": 97.33333333333333, "completions/min_terminated_length": 97.33333333333333, "entropy": 0.02521772775799036, "epoch": 0.4026442307692308, "frac_reward_zero_std": 0.0, "grad_norm": 0.035183537751436234, "learning_rate": 5.974759615384614e-07, "loss": -0.0054, "num_tokens": 74509853.0, "reward": 0.734611988067627, "reward_std": 0.26725097994009656, "rewards/reward_fn/mean": 0.734611988067627, "rewards/reward_fn/std": 0.26725097994009656, "sampling/importance_sampling_ratio/max": 1.606063961982727, "sampling/importance_sampling_ratio/mean": 0.38456633190313977, "sampling/importance_sampling_ratio/min": 0.0001537244200638573, "sampling/sampling_logp_difference/max": 3.5139400164286294, "sampling/sampling_logp_difference/mean": 0.005370703991502523, "step": 3350, "step_time": 10.099865398462862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2209.0, "completions/mean_length": 1610.921875, "completions/mean_terminated_length": 598.4400634765625, "completions/min_length": 135.5, "completions/min_terminated_length": 135.5, "entropy": 0.022738316282629966, "epoch": 0.40384615384615385, "frac_reward_zero_std": 0.0, "grad_norm": 0.011893631890416145, "learning_rate": 5.962740384615385e-07, "loss": 0.0066, "num_tokens": 74690992.0, "reward": 0.6346395015716553, "reward_std": 0.29637470841407776, "rewards/reward_fn/mean": 0.6346395015716553, "rewards/reward_fn/std": 0.29637469351291656, "sampling/importance_sampling_ratio/max": 2.1624813079833984, "sampling/importance_sampling_ratio/mean": 0.3512013405561447, "sampling/importance_sampling_ratio/min": 5.624097138934303e-05, "sampling/sampling_logp_difference/max": 2.7108432054519653, "sampling/sampling_logp_difference/mean": 0.004150787368416786, "step": 3360, "step_time": 7.034767959080637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.17708333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2239.3333333333335, "completions/mean_length": 1019.4583333333334, "completions/mean_terminated_length": 610.7914632161459, "completions/min_length": 128.66666666666666, "completions/min_terminated_length": 128.66666666666666, "entropy": 0.025721309520304204, "epoch": 0.4050480769230769, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.009225219488143921, "learning_rate": 5.950721153846153e-07, "loss": -0.0038, "num_tokens": 74892716.0, "reward": 0.6448699037233988, "reward_std": 0.339739054441452, "rewards/reward_fn/mean": 0.6448699037233988, "rewards/reward_fn/std": 0.339739054441452, "sampling/importance_sampling_ratio/max": 1.6114559173583984, "sampling/importance_sampling_ratio/mean": 0.37241390347480774, "sampling/importance_sampling_ratio/min": 9.160226060582015e-05, "sampling/sampling_logp_difference/max": 2.6921645402908325, "sampling/sampling_logp_difference/mean": 0.005333241385718186, "step": 3370, "step_time": 10.208537295553834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2138.5, "completions/mean_length": 1395.890625, "completions/mean_terminated_length": 762.1783752441406, "completions/min_length": 166.5, "completions/min_terminated_length": 166.5, "entropy": 0.024416710250079632, "epoch": 0.40625, "frac_reward_zero_std": 0.0, "grad_norm": 0.007045926991850138, "learning_rate": 5.938701923076924e-07, "loss": 0.002, "num_tokens": 75057325.0, "reward": 0.7194348275661469, "reward_std": 0.2627021297812462, "rewards/reward_fn/mean": 0.7194348275661469, "rewards/reward_fn/std": 0.2627021297812462, "sampling/importance_sampling_ratio/max": 1.4999111890792847, "sampling/importance_sampling_ratio/mean": 0.27668825536966324, "sampling/importance_sampling_ratio/min": 0.00011463032569736242, "sampling/sampling_logp_difference/max": 2.8567575216293335, "sampling/sampling_logp_difference/mean": 0.005570182343944907, "step": 3380, "step_time": 7.190138670243323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3229166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1913.3333333333333, "completions/mean_length": 1311.2083333333333, "completions/mean_terminated_length": 506.9374084472656, "completions/min_length": 129.66666666666666, "completions/min_terminated_length": 129.66666666666666, "entropy": 0.02560083381831646, "epoch": 0.4074519230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.006236980203539133, "learning_rate": 5.926682692307692e-07, "loss": 0.0157, "num_tokens": 75285321.0, "reward": 0.6951219240824381, "reward_std": 0.27615711092948914, "rewards/reward_fn/mean": 0.6951219240824381, "rewards/reward_fn/std": 0.2761571059624354, "sampling/importance_sampling_ratio/max": 1.4554156064987183, "sampling/importance_sampling_ratio/mean": 0.3072776993115743, "sampling/importance_sampling_ratio/min": 4.3993364765052924e-05, "sampling/sampling_logp_difference/max": 4.0014082590738935, "sampling/sampling_logp_difference/mean": 0.005990742705762386, "step": 3390, "step_time": 10.155658295098693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1762.0, "completions/mean_length": 1498.46875, "completions/mean_terminated_length": 767.9480590820312, "completions/min_length": 144.5, "completions/min_terminated_length": 144.5, "entropy": 0.023706499859690665, "epoch": 0.40865384615384615, "frac_reward_zero_std": 0.0, "grad_norm": 0.010176315903663635, "learning_rate": 5.914663461538461e-07, "loss": -0.0058, "num_tokens": 75453487.0, "reward": 0.6791880130767822, "reward_std": 0.31277525424957275, "rewards/reward_fn/mean": 0.6791880130767822, "rewards/reward_fn/std": 0.31277525424957275, "sampling/importance_sampling_ratio/max": 1.4474551677703857, "sampling/importance_sampling_ratio/mean": 0.2631193995475769, "sampling/importance_sampling_ratio/min": 2.6520614483160898e-05, "sampling/sampling_logp_difference/max": 2.539669632911682, "sampling/sampling_logp_difference/mean": 0.0050580850802361965, "step": 3400, "step_time": 7.144252789765597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1691.3333333333333, "completions/mean_length": 1444.5416666666667, "completions/mean_terminated_length": 527.6913045247396, "completions/min_length": 157.66666666666666, "completions/min_terminated_length": 157.66666666666666, "entropy": 0.02463779505342245, "epoch": 0.4098557692307692, "frac_reward_zero_std": 0.0, "grad_norm": 0.01736047863960266, "learning_rate": 5.902644230769231e-07, "loss": 0.0071, "num_tokens": 75698483.0, "reward": 0.7156394720077515, "reward_std": 0.24589535097281137, "rewards/reward_fn/mean": 0.7156394720077515, "rewards/reward_fn/std": 0.24589534600575766, "sampling/importance_sampling_ratio/max": 1.5377857287724812, "sampling/importance_sampling_ratio/mean": 0.31922340393066406, "sampling/importance_sampling_ratio/min": 6.597523012411936e-05, "sampling/sampling_logp_difference/max": 2.158271908760071, "sampling/sampling_logp_difference/mean": 0.005035793253531058, "step": 3410, "step_time": 10.423028814513236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2333.5, "completions/mean_length": 1084.40625, "completions/mean_terminated_length": 597.423095703125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.021439247764647007, "epoch": 0.4110576923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.03008987009525299, "learning_rate": 5.890625e-07, "loss": -0.004, "num_tokens": 75879541.0, "reward": 0.7500665783882141, "reward_std": 0.2724810987710953, "rewards/reward_fn/mean": 0.7500665783882141, "rewards/reward_fn/std": 0.2724811062216759, "sampling/importance_sampling_ratio/max": 1.368022084236145, "sampling/importance_sampling_ratio/mean": 0.41893370449543, "sampling/importance_sampling_ratio/min": 5.4419441366526655e-06, "sampling/sampling_logp_difference/max": 6.175653278827667, "sampling/sampling_logp_difference/mean": 0.005219755927100778, "step": 3420, "step_time": 8.337956412415952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23958333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2118.0, "completions/mean_length": 1112.71875, "completions/mean_terminated_length": 524.3428853352865, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.023540368676185607, "epoch": 0.41225961538461536, "frac_reward_zero_std": 0.0, "grad_norm": 0.006530220620334148, "learning_rate": 5.878605769230769e-07, "loss": -0.005, "num_tokens": 76073266.0, "reward": 0.6751161615053812, "reward_std": 0.3019627034664154, "rewards/reward_fn/mean": 0.6751161615053812, "rewards/reward_fn/std": 0.3019626984993617, "sampling/importance_sampling_ratio/max": 1.2551290194193523, "sampling/importance_sampling_ratio/mean": 0.4043087462584178, "sampling/importance_sampling_ratio/min": 7.283160812221467e-05, "sampling/sampling_logp_difference/max": 2.2511088053385415, "sampling/sampling_logp_difference/mean": 0.005424715268115203, "step": 3430, "step_time": 10.090649201348423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1731.5, "completions/mean_length": 1464.109375, "completions/mean_terminated_length": 489.26588439941406, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.02108424101024866, "epoch": 0.41346153846153844, "frac_reward_zero_std": 0.0, "grad_norm": 0.01573832891881466, "learning_rate": 5.866586538461539e-07, "loss": -0.0049, "num_tokens": 76223881.0, "reward": 0.6061355471611023, "reward_std": 0.32128433883190155, "rewards/reward_fn/mean": 0.6061355471611023, "rewards/reward_fn/std": 0.32128433883190155, "sampling/importance_sampling_ratio/max": 1.2738459706306458, "sampling/importance_sampling_ratio/mean": 0.32493577897548676, "sampling/importance_sampling_ratio/min": 1.0426894505144446e-05, "sampling/sampling_logp_difference/max": 2.448888421058655, "sampling/sampling_logp_difference/mean": 0.004801162518560886, "step": 3440, "step_time": 7.222398634441197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2708333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2077.3333333333335, "completions/mean_length": 1173.9583333333333, "completions/mean_terminated_length": 497.6079508463542, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.023710201680660247, "epoch": 0.41466346153846156, "frac_reward_zero_std": 0.0, "grad_norm": 0.008106960915029049, "learning_rate": 5.854567307692307e-07, "loss": -0.0034, "num_tokens": 76455021.0, "reward": 0.7465506990750631, "reward_std": 0.21862994134426117, "rewards/reward_fn/mean": 0.7465506990750631, "rewards/reward_fn/std": 0.21862993637720743, "sampling/importance_sampling_ratio/max": 1.7496483325958252, "sampling/importance_sampling_ratio/mean": 0.3847012420495351, "sampling/importance_sampling_ratio/min": 2.4748454810226878e-05, "sampling/sampling_logp_difference/max": 2.5473947127660117, "sampling/sampling_logp_difference/mean": 0.005278619782378276, "step": 3450, "step_time": 10.103426533937455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1851.0, "completions/mean_length": 1376.484375, "completions/mean_terminated_length": 643.5292053222656, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.024947309494018556, "epoch": 0.41586538461538464, "frac_reward_zero_std": 0.0, "grad_norm": 0.007265259511768818, "learning_rate": 5.842548076923076e-07, "loss": -0.0044, "num_tokens": 76624148.0, "reward": 0.7121067345142365, "reward_std": 0.26675935834646225, "rewards/reward_fn/mean": 0.7121067345142365, "rewards/reward_fn/std": 0.26675935089588165, "sampling/importance_sampling_ratio/max": 1.469472885131836, "sampling/importance_sampling_ratio/mean": 0.2659680098295212, "sampling/importance_sampling_ratio/min": 9.763701018528081e-05, "sampling/sampling_logp_difference/max": 1.9395884275436401, "sampling/sampling_logp_difference/mean": 0.005234456388279796, "step": 3460, "step_time": 7.231776522099972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20833333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1651.0, "completions/mean_length": 940.90625, "completions/mean_terminated_length": 398.35845947265625, "completions/min_length": 111.33333333333333, "completions/min_terminated_length": 111.33333333333333, "entropy": 0.019361827336251736, "epoch": 0.4170673076923077, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.008943376131355762, "learning_rate": 5.830528846153846e-07, "loss": -0.0011, "num_tokens": 76810547.0, "reward": 0.6010483702023824, "reward_std": 0.3099573055903117, "rewards/reward_fn/mean": 0.6010483702023824, "rewards/reward_fn/std": 0.30995731552441913, "sampling/importance_sampling_ratio/max": 1.2961880366007488, "sampling/importance_sampling_ratio/mean": 0.4727041920026143, "sampling/importance_sampling_ratio/min": 0.00013069959459244274, "sampling/sampling_logp_difference/max": 2.3370759089787803, "sampling/sampling_logp_difference/mean": 0.0043457115534693, "step": 3470, "step_time": 10.166615448053927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2433.5, "completions/mean_length": 1628.390625, "completions/mean_terminated_length": 701.3027801513672, "completions/min_length": 166.5, "completions/min_terminated_length": 166.5, "entropy": 0.01880371980369091, "epoch": 0.4182692307692308, "frac_reward_zero_std": 0.0, "grad_norm": 0.017684945836663246, "learning_rate": 5.818509615384615e-07, "loss": -0.0048, "num_tokens": 76984180.0, "reward": 0.6718080043792725, "reward_std": 0.2764498442411423, "rewards/reward_fn/mean": 0.6718080043792725, "rewards/reward_fn/std": 0.2764498442411423, "sampling/importance_sampling_ratio/max": 1.7239073514938354, "sampling/importance_sampling_ratio/mean": 0.2680779993534088, "sampling/importance_sampling_ratio/min": 0.0001407775762345409, "sampling/sampling_logp_difference/max": 2.8670281171798706, "sampling/sampling_logp_difference/mean": 0.00512497522868216, "step": 3480, "step_time": 7.035339066479355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19791666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 1917.6666666666667, "completions/mean_length": 1064.03125, "completions/mean_terminated_length": 610.7239685058594, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.0243636904284358, "epoch": 0.41947115384615385, "frac_reward_zero_std": 0.0, "grad_norm": 0.003915412351489067, "learning_rate": 5.806490384615384e-07, "loss": -0.001, "num_tokens": 77193231.0, "reward": 0.7463187774022421, "reward_std": 0.2558238257964452, "rewards/reward_fn/mean": 0.7463187774022421, "rewards/reward_fn/std": 0.2558238257964452, "sampling/importance_sampling_ratio/max": 1.0928898255030315, "sampling/importance_sampling_ratio/mean": 0.31967905163764954, "sampling/importance_sampling_ratio/min": 0.0007387381742015956, "sampling/sampling_logp_difference/max": 2.2277561028798423, "sampling/sampling_logp_difference/mean": 0.005506620276719332, "step": 3490, "step_time": 10.117306559719145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2444.5, "completions/mean_length": 1727.609375, "completions/mean_terminated_length": 727.986083984375, "completions/min_length": 184.5, "completions/min_terminated_length": 184.5, "entropy": 0.02281710673123598, "epoch": 0.4206730769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.002628213493153453, "learning_rate": 5.794471153846154e-07, "loss": -0.0066, "num_tokens": 77374318.0, "reward": 0.6671364903450012, "reward_std": 0.2681064233183861, "rewards/reward_fn/mean": 0.6671364903450012, "rewards/reward_fn/std": 0.2681064307689667, "sampling/importance_sampling_ratio/max": 1.1770579814910889, "sampling/importance_sampling_ratio/mean": 0.24292564392089844, "sampling/importance_sampling_ratio/min": 3.937201568177866e-07, "sampling/sampling_logp_difference/max": 15.74197006225586, "sampling/sampling_logp_difference/mean": 0.004748739534988999, "step": 3500, "step_time": 7.242463867738843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23958333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2441.3333333333335, "completions/mean_length": 1180.71875, "completions/mean_terminated_length": 614.1310221354166, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.023499294836074113, "epoch": 0.421875, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.0046119107864797115, "learning_rate": 5.782451923076923e-07, "loss": -0.0052, "num_tokens": 77590627.0, "reward": 0.6598550180594126, "reward_std": 0.3111528952916463, "rewards/reward_fn/mean": 0.6598550180594126, "rewards/reward_fn/std": 0.3111528903245926, "sampling/importance_sampling_ratio/max": 1.6802875399589539, "sampling/importance_sampling_ratio/mean": 0.3453800678253174, "sampling/importance_sampling_ratio/min": 6.805577747096929e-05, "sampling/sampling_logp_difference/max": 1.489088495572408, "sampling/sampling_logp_difference/mean": 0.00512530740040044, "step": 3510, "step_time": 10.198937508184462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2318.0, "completions/mean_length": 1479.859375, "completions/mean_terminated_length": 550.1551818847656, "completions/min_length": 140.5, "completions/min_terminated_length": 140.5, "entropy": 0.021905313059687615, "epoch": 0.4230769230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.012350926175713539, "learning_rate": 5.770432692307691e-07, "loss": 0.0224, "num_tokens": 77766538.0, "reward": 0.6070539355278015, "reward_std": 0.34295251965522766, "rewards/reward_fn/mean": 0.6070539355278015, "rewards/reward_fn/std": 0.34295251965522766, "sampling/importance_sampling_ratio/max": 1.9475604891777039, "sampling/importance_sampling_ratio/mean": 0.4487265348434448, "sampling/importance_sampling_ratio/min": 0.00011416003962949617, "sampling/sampling_logp_difference/max": 5.311800241470337, "sampling/sampling_logp_difference/mean": 0.004485845798626542, "step": 3520, "step_time": 6.97854871628806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2916666666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1927.6666666666667, "completions/mean_length": 1224.96875, "completions/mean_terminated_length": 477.85303751627606, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.02410754971206188, "epoch": 0.42427884615384615, "frac_reward_zero_std": 0.0, "grad_norm": 0.011043887585401535, "learning_rate": 5.758413461538462e-07, "loss": -0.0034, "num_tokens": 77986007.0, "reward": 0.6922411123911539, "reward_std": 0.2628527233997981, "rewards/reward_fn/mean": 0.6922411123911539, "rewards/reward_fn/std": 0.2628527233997981, "sampling/importance_sampling_ratio/max": 1.2370970646540325, "sampling/importance_sampling_ratio/mean": 0.38113417228062946, "sampling/importance_sampling_ratio/min": 4.9677151158296816e-05, "sampling/sampling_logp_difference/max": 1.9669562578201294, "sampling/sampling_logp_difference/mean": 0.005158823604385058, "step": 3530, "step_time": 10.225607239827514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2427.0, "completions/mean_length": 931.0, "completions/mean_terminated_length": 405.10894775390625, "completions/min_length": 114.5, "completions/min_terminated_length": 114.5, "entropy": 0.02220838498324156, "epoch": 0.4254807692307692, "frac_reward_zero_std": 0.125, "grad_norm": 0.0061986870132386684, "learning_rate": 5.74639423076923e-07, "loss": -0.0057, "num_tokens": 78122903.0, "reward": 0.7751936912536621, "reward_std": 0.21945498883724213, "rewards/reward_fn/mean": 0.7751936912536621, "rewards/reward_fn/std": 0.21945497393608093, "sampling/importance_sampling_ratio/max": 1.9106862545013428, "sampling/importance_sampling_ratio/mean": 0.49202826619148254, "sampling/importance_sampling_ratio/min": 0.00012700442312052473, "sampling/sampling_logp_difference/max": 2.1808817386627197, "sampling/sampling_logp_difference/mean": 0.0047972508473321795, "step": 3540, "step_time": 6.927139577548951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23958333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2207.0, "completions/mean_length": 1075.1770833333333, "completions/mean_terminated_length": 462.5880432128906, "completions/min_length": 117.66666666666667, "completions/min_terminated_length": 117.66666666666667, "entropy": 0.02428277637809515, "epoch": 0.4266826923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.029201306402683258, "learning_rate": 5.734375000000001e-07, "loss": -0.0006, "num_tokens": 78341528.0, "reward": 0.687649647394816, "reward_std": 0.30549635489781696, "rewards/reward_fn/mean": 0.687649647394816, "rewards/reward_fn/std": 0.3054963747660319, "sampling/importance_sampling_ratio/max": 1.6248629093170166, "sampling/importance_sampling_ratio/mean": 0.399588276942571, "sampling/importance_sampling_ratio/min": 0.00034932322414723177, "sampling/sampling_logp_difference/max": 2.1963109970092773, "sampling/sampling_logp_difference/mean": 0.0056671334120134515, "step": 3550, "step_time": 10.354111399315297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2386.5, "completions/mean_length": 1093.3125, "completions/mean_terminated_length": 637.5736694335938, "completions/min_length": 133.5, "completions/min_terminated_length": 133.5, "entropy": 0.028055174462497234, "epoch": 0.42788461538461536, "frac_reward_zero_std": 0.0, "grad_norm": 0.002275680424645543, "learning_rate": 5.722355769230769e-07, "loss": -0.002, "num_tokens": 78484372.0, "reward": 0.7823580205440521, "reward_std": 0.20949815213680267, "rewards/reward_fn/mean": 0.7823580205440521, "rewards/reward_fn/std": 0.20949813723564148, "sampling/importance_sampling_ratio/max": 0.9744383990764618, "sampling/importance_sampling_ratio/mean": 0.3308398872613907, "sampling/importance_sampling_ratio/min": 0.0019498620849844883, "sampling/sampling_logp_difference/max": 2.6812245845794678, "sampling/sampling_logp_difference/mean": 0.005758732790127397, "step": 3560, "step_time": 7.090542422607541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3645833333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2582.3333333333335, "completions/mean_length": 1484.6041666666667, "completions/mean_terminated_length": 616.0420735677084, "completions/min_length": 160.33333333333334, "completions/min_terminated_length": 160.33333333333334, "entropy": 0.02330990582704544, "epoch": 0.42908653846153844, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.004376153461635113, "learning_rate": 5.710336538461538e-07, "loss": -0.0003, "num_tokens": 78737046.0, "reward": 0.5947919885317484, "reward_std": 0.3278126021226247, "rewards/reward_fn/mean": 0.5947919885317484, "rewards/reward_fn/std": 0.32781259218851727, "sampling/importance_sampling_ratio/max": 1.5866297483444214, "sampling/importance_sampling_ratio/mean": 0.3272210955619812, "sampling/importance_sampling_ratio/min": 7.760652806609869e-06, "sampling/sampling_logp_difference/max": 2.9429765144983926, "sampling/sampling_logp_difference/mean": 0.005182072675476472, "step": 3570, "step_time": 10.539972108323127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 2440.5, "completions/mean_length": 1300.046875, "completions/mean_terminated_length": 724.8536071777344, "completions/min_length": 157.5, "completions/min_terminated_length": 157.5, "entropy": 0.022509184665977955, "epoch": 0.43028846153846156, "frac_reward_zero_std": 0.0, "grad_norm": 0.01754099316895008, "learning_rate": 5.698317307692308e-07, "loss": 0.003, "num_tokens": 78892521.0, "reward": 0.7661837935447693, "reward_std": 0.2106526419520378, "rewards/reward_fn/mean": 0.7661837935447693, "rewards/reward_fn/std": 0.21065263450145721, "sampling/importance_sampling_ratio/max": 1.8959749937057495, "sampling/importance_sampling_ratio/mean": 0.3733685314655304, "sampling/importance_sampling_ratio/min": 0.003775778179488043, "sampling/sampling_logp_difference/max": 2.444231390953064, "sampling/sampling_logp_difference/mean": 0.005374192260205746, "step": 3580, "step_time": 7.0149135740473865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1909.3333333333333, "completions/mean_length": 1144.7083333333333, "completions/mean_terminated_length": 443.0614420572917, "completions/min_length": 132.66666666666666, "completions/min_terminated_length": 132.66666666666666, "entropy": 0.025140822120010852, "epoch": 0.43149038461538464, "frac_reward_zero_std": 0.0, "grad_norm": 0.025491734966635704, "learning_rate": 5.686298076923076e-07, "loss": -0.0037, "num_tokens": 79117445.0, "reward": 0.7042335073153178, "reward_std": 0.25338566303253174, "rewards/reward_fn/mean": 0.7042335073153178, "rewards/reward_fn/std": 0.253385658065478, "sampling/importance_sampling_ratio/max": 2.0527181228001914, "sampling/importance_sampling_ratio/mean": 0.4430508514245351, "sampling/importance_sampling_ratio/min": 0.0001598300060929129, "sampling/sampling_logp_difference/max": 1.6197961966196697, "sampling/sampling_logp_difference/mean": 0.005409353567908208, "step": 3590, "step_time": 10.339907584432513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2295.5, "completions/mean_length": 1419.796875, "completions/mean_terminated_length": 590.7151184082031, "completions/min_length": 167.5, "completions/min_terminated_length": 167.5, "entropy": 0.023886379785835743, "epoch": 0.4326923076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.009264661930501461, "learning_rate": 5.674278846153846e-07, "loss": 0.0021, "num_tokens": 79308368.0, "reward": 0.6464300751686096, "reward_std": 0.2849503308534622, "rewards/reward_fn/mean": 0.6464300751686096, "rewards/reward_fn/std": 0.2849503308534622, "sampling/importance_sampling_ratio/max": 1.4416422247886658, "sampling/importance_sampling_ratio/mean": 0.306487038731575, "sampling/importance_sampling_ratio/min": 1.9597030927798187e-05, "sampling/sampling_logp_difference/max": 2.4817216396331787, "sampling/sampling_logp_difference/mean": 0.004861527355387807, "step": 3600, "step_time": 7.233052104711533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2708333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2067.0, "completions/mean_length": 1108.5416666666667, "completions/mean_terminated_length": 406.81614176432294, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.020145629346370698, "epoch": 0.4338942307692308, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.0051310122944414616, "learning_rate": 5.662259615384615e-07, "loss": 0.0023, "num_tokens": 79514156.0, "reward": 0.680109977722168, "reward_std": 0.29432761172453564, "rewards/reward_fn/mean": 0.680109977722168, "rewards/reward_fn/std": 0.29432761172453564, "sampling/importance_sampling_ratio/max": 1.5850738684336345, "sampling/importance_sampling_ratio/mean": 0.4380357066790263, "sampling/importance_sampling_ratio/min": 6.26371139181477e-05, "sampling/sampling_logp_difference/max": 4.703222592671712, "sampling/sampling_logp_difference/mean": 0.004998302242408196, "step": 3610, "step_time": 10.370156747754663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2594.5, "completions/mean_length": 1176.203125, "completions/mean_terminated_length": 617.5351257324219, "completions/min_length": 143.5, "completions/min_terminated_length": 143.5, "entropy": 0.023980528861284257, "epoch": 0.43509615384615385, "frac_reward_zero_std": 0.125, "grad_norm": 0.011574736796319485, "learning_rate": 5.650240384615385e-07, "loss": 0.0069, "num_tokens": 79657433.0, "reward": 0.6468505263328552, "reward_std": 0.30850259214639664, "rewards/reward_fn/mean": 0.6468505263328552, "rewards/reward_fn/std": 0.30850259214639664, "sampling/importance_sampling_ratio/max": 1.2619696855545044, "sampling/importance_sampling_ratio/mean": 0.3650178164243698, "sampling/importance_sampling_ratio/min": 0.0001697068273642799, "sampling/sampling_logp_difference/max": 3.281561255455017, "sampling/sampling_logp_difference/mean": 0.004551101243123412, "step": 3620, "step_time": 7.133866872172803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1551.6666666666667, "completions/mean_length": 626.6979166666666, "completions/mean_terminated_length": 379.43650309244794, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.023648987524211407, "epoch": 0.4362980769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.0912761464715004, "learning_rate": 5.638221153846153e-07, "loss": 0.0341, "num_tokens": 79812588.0, "reward": 0.8511076172192892, "reward_std": 0.15026502559582391, "rewards/reward_fn/mean": 0.8511076172192892, "rewards/reward_fn/std": 0.15026501814524332, "sampling/importance_sampling_ratio/max": 1.515974998474121, "sampling/importance_sampling_ratio/mean": 0.5848462879657745, "sampling/importance_sampling_ratio/min": 0.007934474485712903, "sampling/sampling_logp_difference/max": 1.7752972443898518, "sampling/sampling_logp_difference/mean": 0.0050101686889926595, "step": 3630, "step_time": 9.658845658600331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1807.0, "completions/mean_length": 706.5, "completions/mean_terminated_length": 380.1060028076172, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.020791074633598326, "epoch": 0.4375, "frac_reward_zero_std": 0.125, "grad_norm": 0.005837175529450178, "learning_rate": 5.626201923076923e-07, "loss": 0.0096, "num_tokens": 79922036.0, "reward": 0.6090506315231323, "reward_std": 0.3914608806371689, "rewards/reward_fn/mean": 0.6090506315231323, "rewards/reward_fn/std": 0.3914608508348465, "sampling/importance_sampling_ratio/max": 1.100137710571289, "sampling/importance_sampling_ratio/mean": 0.4729166477918625, "sampling/importance_sampling_ratio/min": 7.34604946046602e-05, "sampling/sampling_logp_difference/max": 1.696777582168579, "sampling/sampling_logp_difference/mean": 0.004867844516411424, "step": 3640, "step_time": 6.815530110057443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2604166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1930.3333333333333, "completions/mean_length": 1197.9166666666667, "completions/mean_terminated_length": 582.2667846679688, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.026708983443677427, "epoch": 0.4387019230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.006354243494570255, "learning_rate": 5.614182692307692e-07, "loss": -0.008, "num_tokens": 80139188.0, "reward": 0.7407690286636353, "reward_std": 0.25745318333307904, "rewards/reward_fn/mean": 0.7407690286636353, "rewards/reward_fn/std": 0.25745317836602527, "sampling/importance_sampling_ratio/max": 1.774131178855896, "sampling/importance_sampling_ratio/mean": 0.3235058585802714, "sampling/importance_sampling_ratio/min": 0.0001360882330724659, "sampling/sampling_logp_difference/max": 3.4706337451934814, "sampling/sampling_logp_difference/mean": 0.006211093161255121, "step": 3650, "step_time": 10.319109088368714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2650.0, "completions/mean_length": 1635.59375, "completions/mean_terminated_length": 869.2428588867188, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.024806672148406504, "epoch": 0.43990384615384615, "frac_reward_zero_std": 0.0, "grad_norm": 0.007179643958806992, "learning_rate": 5.602163461538461e-07, "loss": -0.0035, "num_tokens": 80323938.0, "reward": 0.6943228542804718, "reward_std": 0.26617442071437836, "rewards/reward_fn/mean": 0.6943228542804718, "rewards/reward_fn/std": 0.26617440581321716, "sampling/importance_sampling_ratio/max": 1.7641507387161255, "sampling/importance_sampling_ratio/mean": 0.22803548723459244, "sampling/importance_sampling_ratio/min": 9.484677684667986e-05, "sampling/sampling_logp_difference/max": 2.509170413017273, "sampling/sampling_logp_difference/mean": 0.005356281064450741, "step": 3660, "step_time": 7.011028286535293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23958333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2291.0, "completions/mean_length": 1201.6458333333333, "completions/mean_terminated_length": 627.1807047526041, "completions/min_length": 159.66666666666666, "completions/min_terminated_length": 159.66666666666666, "entropy": 0.024183417297899724, "epoch": 0.4411057692307692, "frac_reward_zero_std": 0.0, "grad_norm": 0.006046023685485125, "learning_rate": 5.590144230769231e-07, "loss": 0.0037, "num_tokens": 80524728.0, "reward": 0.7585883537928263, "reward_std": 0.27022309601306915, "rewards/reward_fn/mean": 0.7585883537928263, "rewards/reward_fn/std": 0.27022310098012287, "sampling/importance_sampling_ratio/max": 1.4888374010721843, "sampling/importance_sampling_ratio/mean": 0.33045106132825214, "sampling/importance_sampling_ratio/min": 3.908019942538398e-05, "sampling/sampling_logp_difference/max": 7.7731514771779375, "sampling/sampling_logp_difference/mean": 0.005858076736330986, "step": 3670, "step_time": 10.123915529157966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1641.0, "completions/mean_length": 883.171875, "completions/mean_terminated_length": 482.818603515625, "completions/min_length": 116.5, "completions/min_terminated_length": 116.5, "entropy": 0.024571815878152846, "epoch": 0.4423076923076923, "frac_reward_zero_std": 0.125, "grad_norm": 0.0117906229570508, "learning_rate": 5.578125e-07, "loss": -0.004, "num_tokens": 80651795.0, "reward": 0.8107866048812866, "reward_std": 0.22172196209430695, "rewards/reward_fn/mean": 0.8107866048812866, "rewards/reward_fn/std": 0.22172196954488754, "sampling/importance_sampling_ratio/max": 1.2676392197608948, "sampling/importance_sampling_ratio/mean": 0.4233025312423706, "sampling/importance_sampling_ratio/min": 0.0007828312955098227, "sampling/sampling_logp_difference/max": 1.120188593864441, "sampling/sampling_logp_difference/mean": 0.004528814693912864, "step": 3680, "step_time": 6.813468335196376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23958333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2096.0, "completions/mean_length": 1182.96875, "completions/mean_terminated_length": 610.8094482421875, "completions/min_length": 101.66666666666667, "completions/min_terminated_length": 101.66666666666667, "entropy": 0.01910180663689971, "epoch": 0.44350961538461536, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.004460582975298166, "learning_rate": 5.566105769230768e-07, "loss": 0.0042, "num_tokens": 80894328.0, "reward": 0.7295627593994141, "reward_std": 0.27652858197689056, "rewards/reward_fn/mean": 0.7295627593994141, "rewards/reward_fn/std": 0.2765285869439443, "sampling/importance_sampling_ratio/max": 1.0974765221277873, "sampling/importance_sampling_ratio/mean": 0.3767570952574412, "sampling/importance_sampling_ratio/min": 0.0001251522176820193, "sampling/sampling_logp_difference/max": 2.8577704429626465, "sampling/sampling_logp_difference/mean": 0.004908320493996143, "step": 3690, "step_time": 10.397424120362848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2457.5, "completions/mean_length": 1520.140625, "completions/mean_terminated_length": 562.9050903320312, "completions/min_length": 132.5, "completions/min_terminated_length": 132.5, "entropy": 0.02149686962366104, "epoch": 0.44471153846153844, "frac_reward_zero_std": 0.125, "grad_norm": 0.001616193214431405, "learning_rate": 5.554086538461539e-07, "loss": -0.0017, "num_tokens": 81068313.0, "reward": 0.7059689462184906, "reward_std": 0.25784924626350403, "rewards/reward_fn/mean": 0.7059689462184906, "rewards/reward_fn/std": 0.25784923136234283, "sampling/importance_sampling_ratio/max": 1.4252293705940247, "sampling/importance_sampling_ratio/mean": 0.3295500725507736, "sampling/importance_sampling_ratio/min": 7.767826173221692e-05, "sampling/sampling_logp_difference/max": 2.441020965576172, "sampling/sampling_logp_difference/mean": 0.0041576530784368515, "step": 3700, "step_time": 7.036229893565178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2052.3333333333335, "completions/mean_length": 1140.375, "completions/mean_terminated_length": 617.4431559244791, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.02625011410564184, "epoch": 0.44591346153846156, "frac_reward_zero_std": 0.0, "grad_norm": 0.006864062510430813, "learning_rate": 5.542067307692307e-07, "loss": 0.0113, "num_tokens": 81271749.0, "reward": 0.7820551991462708, "reward_std": 0.23985389371713003, "rewards/reward_fn/mean": 0.7820551991462708, "rewards/reward_fn/std": 0.2398538887500763, "sampling/importance_sampling_ratio/max": 1.205061634381612, "sampling/importance_sampling_ratio/mean": 0.32274489601453143, "sampling/importance_sampling_ratio/min": 8.742806471673248e-06, "sampling/sampling_logp_difference/max": 3.1410563786824546, "sampling/sampling_logp_difference/mean": 0.005791100673377514, "step": 3710, "step_time": 10.29101259149611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2117.5, "completions/mean_length": 1337.140625, "completions/mean_terminated_length": 573.3166961669922, "completions/min_length": 152.5, "completions/min_terminated_length": 152.5, "entropy": 0.02127139214426279, "epoch": 0.44711538461538464, "frac_reward_zero_std": 0.125, "grad_norm": 0.009971297346055508, "learning_rate": 5.530048076923076e-07, "loss": -0.0013, "num_tokens": 81414014.0, "reward": 0.6527042388916016, "reward_std": 0.3055068552494049, "rewards/reward_fn/mean": 0.6527042388916016, "rewards/reward_fn/std": 0.3055068552494049, "sampling/importance_sampling_ratio/max": 2.001336097717285, "sampling/importance_sampling_ratio/mean": 0.4336584508419037, "sampling/importance_sampling_ratio/min": 0.00011314430582487489, "sampling/sampling_logp_difference/max": 2.3058934211730957, "sampling/sampling_logp_difference/mean": 0.004142601625062525, "step": 3720, "step_time": 7.071407460514456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2708333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2278.6666666666665, "completions/mean_length": 1264.4895833333333, "completions/mean_terminated_length": 587.4753011067709, "completions/min_length": 153.66666666666666, "completions/min_terminated_length": 153.66666666666666, "entropy": 0.022499034646898508, "epoch": 0.4483173076923077, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.004179755225777626, "learning_rate": 5.518028846153846e-07, "loss": -0.0034, "num_tokens": 81647069.0, "reward": 0.6692634125550588, "reward_std": 0.27711590627829236, "rewards/reward_fn/mean": 0.6692634125550588, "rewards/reward_fn/std": 0.2771158864100774, "sampling/importance_sampling_ratio/max": 1.3829553922017415, "sampling/importance_sampling_ratio/mean": 0.311348557472229, "sampling/importance_sampling_ratio/min": 9.3430930443598e-05, "sampling/sampling_logp_difference/max": 4.197641372680664, "sampling/sampling_logp_difference/mean": 0.005109287798404694, "step": 3730, "step_time": 10.169021792709827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2214.0, "completions/max_terminated_length": 2024.5, "completions/mean_length": 553.796875, "completions/mean_terminated_length": 438.7801818847656, "completions/min_length": 103.5, "completions/min_terminated_length": 103.5, "entropy": 0.02397573124617338, "epoch": 0.4495192307692308, "frac_reward_zero_std": 0.125, "grad_norm": 0.12126105278730392, "learning_rate": 5.506009615384615e-07, "loss": -0.0203, "num_tokens": 81737144.0, "reward": 0.7465563118457794, "reward_std": 0.26540056616067886, "rewards/reward_fn/mean": 0.7465563118457794, "rewards/reward_fn/std": 0.26540056616067886, "sampling/importance_sampling_ratio/max": 1.3434414863586426, "sampling/importance_sampling_ratio/mean": 0.5139709115028381, "sampling/importance_sampling_ratio/min": 0.004421885594638297, "sampling/sampling_logp_difference/max": 2.123106837272644, "sampling/sampling_logp_difference/mean": 0.004926498979330063, "step": 3740, "step_time": 5.006534438114613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.22916666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2680.6666666666665, "completions/mean_length": 1100.625, "completions/mean_terminated_length": 539.219960530599, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.020107254944741727, "epoch": 0.45072115384615385, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.03870681673288345, "learning_rate": 5.493990384615385e-07, "loss": 0.0015, "num_tokens": 81925012.0, "reward": 0.6530254979928335, "reward_std": 0.26912033557891846, "rewards/reward_fn/mean": 0.6530254979928335, "rewards/reward_fn/std": 0.26912034551302594, "sampling/importance_sampling_ratio/max": 1.2953778505325317, "sampling/importance_sampling_ratio/mean": 0.4553590714931488, "sampling/importance_sampling_ratio/min": 8.002909279033095e-05, "sampling/sampling_logp_difference/max": 3.2603975931803384, "sampling/sampling_logp_difference/mean": 0.004407146635154883, "step": 3750, "step_time": 10.21600504303351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2415.5, "completions/mean_length": 1399.90625, "completions/mean_terminated_length": 570.2860565185547, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.021035873517394065, "epoch": 0.4519230769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.002956206211820245, "learning_rate": 5.481971153846153e-07, "loss": 0.0004, "num_tokens": 82080742.0, "reward": 0.7211422622203827, "reward_std": 0.2467539831995964, "rewards/reward_fn/mean": 0.7211422622203827, "rewards/reward_fn/std": 0.2467539757490158, "sampling/importance_sampling_ratio/max": 1.6763988733291626, "sampling/importance_sampling_ratio/mean": 0.3013821244239807, "sampling/importance_sampling_ratio/min": 0.0006381694383890135, "sampling/sampling_logp_difference/max": 1.5463486909866333, "sampling/sampling_logp_difference/mean": 0.004779732087627053, "step": 3760, "step_time": 7.064250679221004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1879.6666666666667, "completions/mean_length": 1252.7708333333333, "completions/mean_terminated_length": 465.4569498697917, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.02204953581094742, "epoch": 0.453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.004937006626278162, "learning_rate": 5.469951923076923e-07, "loss": -0.0052, "num_tokens": 82310608.0, "reward": 0.7215341130892435, "reward_std": 0.2572016219298045, "rewards/reward_fn/mean": 0.7215341130892435, "rewards/reward_fn/std": 0.257201611995697, "sampling/importance_sampling_ratio/max": 1.351334571838379, "sampling/importance_sampling_ratio/mean": 0.30103325843811035, "sampling/importance_sampling_ratio/min": 0.00013472099950225433, "sampling/sampling_logp_difference/max": 2.2470789750417075, "sampling/sampling_logp_difference/mean": 0.005332771843920152, "step": 3770, "step_time": 10.139252594206482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 1186.734375, "completions/mean_terminated_length": 434.39198303222656, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.01915734652429819, "epoch": 0.4543269230769231, "frac_reward_zero_std": 0.125, "grad_norm": 0.011592148803174496, "learning_rate": 5.457932692307692e-07, "loss": -0.0044, "num_tokens": 82464759.0, "reward": 0.72163987159729, "reward_std": 0.24269109964370728, "rewards/reward_fn/mean": 0.72163987159729, "rewards/reward_fn/std": 0.24269109219312668, "sampling/importance_sampling_ratio/max": 1.222329020500183, "sampling/importance_sampling_ratio/mean": 0.4026424363255501, "sampling/importance_sampling_ratio/min": 2.4643094548082445e-05, "sampling/sampling_logp_difference/max": 2.7872135639190674, "sampling/sampling_logp_difference/mean": 0.004399893106892705, "step": 3780, "step_time": 7.044334849435836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20833333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1464.6666666666667, "completions/mean_length": 968.9270833333334, "completions/mean_terminated_length": 423.1800842285156, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.023342660069465636, "epoch": 0.45552884615384615, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.009153934195637703, "learning_rate": 5.445913461538462e-07, "loss": -0.0001, "num_tokens": 82646688.0, "reward": 0.7125287652015686, "reward_std": 0.2782345811525981, "rewards/reward_fn/mean": 0.7125287652015686, "rewards/reward_fn/std": 0.2782345712184906, "sampling/importance_sampling_ratio/max": 1.4039932489395142, "sampling/importance_sampling_ratio/mean": 0.42189399401346844, "sampling/importance_sampling_ratio/min": 0.000593263351523395, "sampling/sampling_logp_difference/max": 2.894390106201172, "sampling/sampling_logp_difference/mean": 0.004777644916127126, "step": 3790, "step_time": 10.115822135843336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1826.0, "completions/mean_length": 856.09375, "completions/mean_terminated_length": 362.2785186767578, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.02187449112534523, "epoch": 0.4567307692307692, "frac_reward_zero_std": 0.0, "grad_norm": 0.018033673986792564, "learning_rate": 5.43389423076923e-07, "loss": -0.0053, "num_tokens": 82773110.0, "reward": 0.783909797668457, "reward_std": 0.220066137611866, "rewards/reward_fn/mean": 0.783909797668457, "rewards/reward_fn/std": 0.2200661227107048, "sampling/importance_sampling_ratio/max": 1.6079595685005188, "sampling/importance_sampling_ratio/mean": 0.4821605980396271, "sampling/importance_sampling_ratio/min": 2.7319620130583644e-05, "sampling/sampling_logp_difference/max": 3.846637725830078, "sampling/sampling_logp_difference/mean": 0.005369074642658234, "step": 3800, "step_time": 6.790612826868892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3229166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2109.3333333333335, "completions/mean_length": 1425.7708333333333, "completions/mean_terminated_length": 701.6283976236979, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.022814785689115526, "epoch": 0.4579326923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.01288191881030798, "learning_rate": 5.421875e-07, "loss": -0.0019, "num_tokens": 83018608.0, "reward": 0.6977747281392416, "reward_std": 0.28084121147791546, "rewards/reward_fn/mean": 0.6977747281392416, "rewards/reward_fn/std": 0.28084121147791546, "sampling/importance_sampling_ratio/max": 1.35777614514033, "sampling/importance_sampling_ratio/mean": 0.2922083189090093, "sampling/importance_sampling_ratio/min": 0.0005960864242903577, "sampling/sampling_logp_difference/max": 1.5968427260716755, "sampling/sampling_logp_difference/mean": 0.004935992105553548, "step": 3810, "step_time": 10.398668766953051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1842.5, "completions/mean_length": 1527.78125, "completions/mean_terminated_length": 616.0000305175781, "completions/min_length": 147.5, "completions/min_terminated_length": 147.5, "entropy": 0.025188893266022205, "epoch": 0.45913461538461536, "frac_reward_zero_std": 0.0, "grad_norm": 0.013438488356769085, "learning_rate": 5.409855769230769e-07, "loss": -0.0059, "num_tokens": 83183434.0, "reward": 0.6699011325836182, "reward_std": 0.2488022968173027, "rewards/reward_fn/mean": 0.6699011325836182, "rewards/reward_fn/std": 0.2488022968173027, "sampling/importance_sampling_ratio/max": 1.5035873055458069, "sampling/importance_sampling_ratio/mean": 0.28930944204330444, "sampling/importance_sampling_ratio/min": 1.2702704452749458e-05, "sampling/sampling_logp_difference/max": 2.5285332202911377, "sampling/sampling_logp_difference/mean": 0.005484763300046325, "step": 3820, "step_time": 7.013699228130281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 2545.3333333333335, "completions/mean_length": 1304.625, "completions/mean_terminated_length": 740.0343831380209, "completions/min_length": 182.66666666666666, "completions/min_terminated_length": 182.66666666666666, "entropy": 0.02429873263463378, "epoch": 0.46033653846153844, "frac_reward_zero_std": 0.0, "grad_norm": 0.021525384858250618, "learning_rate": 5.397836538461538e-07, "loss": 0.004, "num_tokens": 83416958.0, "reward": 0.7911117275555929, "reward_std": 0.2086756726106008, "rewards/reward_fn/mean": 0.7911117275555929, "rewards/reward_fn/std": 0.20867566267649332, "sampling/importance_sampling_ratio/max": 1.7594878276189168, "sampling/importance_sampling_ratio/mean": 0.34931068619092304, "sampling/importance_sampling_ratio/min": 0.00012000137417089718, "sampling/sampling_logp_difference/max": 2.4725467363993325, "sampling/sampling_logp_difference/mean": 0.005198771754900615, "step": 3830, "step_time": 10.426853241864592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1686.0, "completions/mean_length": 1433.65625, "completions/mean_terminated_length": 644.1597290039062, "completions/min_length": 168.5, "completions/min_terminated_length": 168.5, "entropy": 0.02573372106999159, "epoch": 0.46153846153846156, "frac_reward_zero_std": 0.0, "grad_norm": 0.007081238087266684, "learning_rate": 5.385817307692308e-07, "loss": -0.0096, "num_tokens": 83582872.0, "reward": 0.729054182767868, "reward_std": 0.23135247081518173, "rewards/reward_fn/mean": 0.729054182767868, "rewards/reward_fn/std": 0.23135247081518173, "sampling/importance_sampling_ratio/max": 1.3036720156669617, "sampling/importance_sampling_ratio/mean": 0.2942059636116028, "sampling/importance_sampling_ratio/min": 4.114293687962345e-06, "sampling/sampling_logp_difference/max": 2.814301609992981, "sampling/sampling_logp_difference/mean": 0.005442620953544974, "step": 3840, "step_time": 7.013926535472274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2244.6666666666665, "completions/mean_length": 1307.4791666666667, "completions/mean_terminated_length": 570.3138427734375, "completions/min_length": 111.33333333333333, "completions/min_terminated_length": 111.33333333333333, "entropy": 0.02232547663152218, "epoch": 0.46274038461538464, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.0023639812134206295, "learning_rate": 5.373798076923076e-07, "loss": -0.0082, "num_tokens": 83829318.0, "reward": 0.5819969276587168, "reward_std": 0.30513935287793476, "rewards/reward_fn/mean": 0.5819969276587168, "rewards/reward_fn/std": 0.30513934791088104, "sampling/importance_sampling_ratio/max": 1.3883811235427856, "sampling/importance_sampling_ratio/mean": 0.3847511013348897, "sampling/importance_sampling_ratio/min": 1.6073466516293895e-05, "sampling/sampling_logp_difference/max": 2.663541396458944, "sampling/sampling_logp_difference/mean": 0.004678390454500914, "step": 3850, "step_time": 10.005850830953568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 2629.0, "completions/mean_length": 1302.421875, "completions/mean_terminated_length": 720.5178833007812, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.02269310224801302, "epoch": 0.4639423076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.011808991432189941, "learning_rate": 5.361778846153846e-07, "loss": -0.0057, "num_tokens": 83986689.0, "reward": 0.7704191207885742, "reward_std": 0.24263237416744232, "rewards/reward_fn/mean": 0.7704191207885742, "rewards/reward_fn/std": 0.24263238161802292, "sampling/importance_sampling_ratio/max": 1.365980863571167, "sampling/importance_sampling_ratio/mean": 0.34509213268756866, "sampling/importance_sampling_ratio/min": 8.85126519278856e-06, "sampling/sampling_logp_difference/max": 2.7683509588241577, "sampling/sampling_logp_difference/mean": 0.005631244042888284, "step": 3860, "step_time": 7.1256698335520925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2351.0, "completions/mean_length": 1657.4583333333333, "completions/mean_terminated_length": 747.7833048502604, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.02533762715756893, "epoch": 0.4651442307692308, "frac_reward_zero_std": 0.0, "grad_norm": 0.0009786299197003245, "learning_rate": 5.349759615384615e-07, "loss": -0.0014, "num_tokens": 84274733.0, "reward": 0.6555309693018595, "reward_std": 0.26255930463473004, "rewards/reward_fn/mean": 0.6555309693018595, "rewards/reward_fn/std": 0.26255930463473004, "sampling/importance_sampling_ratio/max": 1.254625678062439, "sampling/importance_sampling_ratio/mean": 0.20592126001914343, "sampling/importance_sampling_ratio/min": 0.0001787897890608292, "sampling/sampling_logp_difference/max": 3.37257448832194, "sampling/sampling_logp_difference/mean": 0.005161165414998929, "step": 3870, "step_time": 10.509967870172114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2650.5, "completions/mean_length": 1334.234375, "completions/mean_terminated_length": 577.0682067871094, "completions/min_length": 151.5, "completions/min_terminated_length": 151.5, "entropy": 0.021492561139166356, "epoch": 0.46634615384615385, "frac_reward_zero_std": 0.0, "grad_norm": 0.01209369208663702, "learning_rate": 5.337740384615384e-07, "loss": 0.0171, "num_tokens": 84433596.0, "reward": 0.7179509401321411, "reward_std": 0.25800077617168427, "rewards/reward_fn/mean": 0.7179509401321411, "rewards/reward_fn/std": 0.25800077617168427, "sampling/importance_sampling_ratio/max": 1.159534752368927, "sampling/importance_sampling_ratio/mean": 0.37775640189647675, "sampling/importance_sampling_ratio/min": 6.683233357307472e-05, "sampling/sampling_logp_difference/max": 3.34143602848053, "sampling/sampling_logp_difference/mean": 0.004576103528961539, "step": 3880, "step_time": 7.272756580170244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2023.3333333333333, "completions/mean_length": 1640.1666666666667, "completions/mean_terminated_length": 640.0776672363281, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.022220754623413087, "epoch": 0.4675480769230769, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.00985984317958355, "learning_rate": 5.325721153846154e-07, "loss": -0.0028, "num_tokens": 84707244.0, "reward": 0.6015624304612478, "reward_std": 0.31388827164967853, "rewards/reward_fn/mean": 0.6015624304612478, "rewards/reward_fn/std": 0.31388827164967853, "sampling/importance_sampling_ratio/max": 1.5744770367940266, "sampling/importance_sampling_ratio/mean": 0.28722960750261944, "sampling/importance_sampling_ratio/min": 0.00023409474732716262, "sampling/sampling_logp_difference/max": 3.1475271383921304, "sampling/sampling_logp_difference/mean": 0.004979953480263551, "step": 3890, "step_time": 10.477869909908623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2946.5, "completions/mean_length": 1144.015625, "completions/mean_terminated_length": 704.1273193359375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.021826814673841, "epoch": 0.46875, "frac_reward_zero_std": 0.25, "grad_norm": 0.011549496091902256, "learning_rate": 5.313701923076923e-07, "loss": -0.0023, "num_tokens": 84844749.0, "reward": 0.7482384443283081, "reward_std": 0.23946453630924225, "rewards/reward_fn/mean": 0.7482384443283081, "rewards/reward_fn/std": 0.23946452140808105, "sampling/importance_sampling_ratio/max": 1.7381431460380554, "sampling/importance_sampling_ratio/mean": 0.42611292749643326, "sampling/importance_sampling_ratio/min": 8.318685650010593e-05, "sampling/sampling_logp_difference/max": 2.31910240650177, "sampling/sampling_logp_difference/mean": 0.004360313527286053, "step": 3900, "step_time": 7.056101913005113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.22916666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2666.3333333333335, "completions/mean_length": 1158.9375, "completions/mean_terminated_length": 638.0911865234375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.025340541265904903, "epoch": 0.4699519230769231, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.011751241981983185, "learning_rate": 5.301682692307692e-07, "loss": -0.0056, "num_tokens": 85078551.0, "reward": 0.6632635196050009, "reward_std": 0.32398782173792523, "rewards/reward_fn/mean": 0.6632635196050009, "rewards/reward_fn/std": 0.32398782173792523, "sampling/importance_sampling_ratio/max": 1.0201916495958965, "sampling/importance_sampling_ratio/mean": 0.33759840329488117, "sampling/importance_sampling_ratio/min": 2.7488600608194247e-05, "sampling/sampling_logp_difference/max": 2.943089405695597, "sampling/sampling_logp_difference/mean": 0.005368585543086131, "step": 3910, "step_time": 10.310371448006482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1762.0, "completions/mean_length": 1788.859375, "completions/mean_terminated_length": 655.4485168457031, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.023373505845665932, "epoch": 0.47115384615384615, "frac_reward_zero_std": 0.0, "grad_norm": 0.007588670589029789, "learning_rate": 5.289663461538462e-07, "loss": 0.0118, "num_tokens": 85260286.0, "reward": 0.6793208420276642, "reward_std": 0.25250694155693054, "rewards/reward_fn/mean": 0.6793208420276642, "rewards/reward_fn/std": 0.25250695645809174, "sampling/importance_sampling_ratio/max": 0.7590655088424683, "sampling/importance_sampling_ratio/mean": 0.16207462176680565, "sampling/importance_sampling_ratio/min": 8.560838705307106e-05, "sampling/sampling_logp_difference/max": 2.2102792859077454, "sampling/sampling_logp_difference/mean": 0.004599933512508869, "step": 3920, "step_time": 7.162863396666944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3333333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2276.6666666666665, "completions/mean_length": 1401.2083333333333, "completions/mean_terminated_length": 575.7235412597656, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.020678754709661008, "epoch": 0.4723557692307692, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.004572604317218065, "learning_rate": 5.27764423076923e-07, "loss": -0.0044, "num_tokens": 85505058.0, "reward": 0.6440707643826803, "reward_std": 0.3061645378669103, "rewards/reward_fn/mean": 0.6440707643826803, "rewards/reward_fn/std": 0.30616452793280285, "sampling/importance_sampling_ratio/max": 1.8855229218800862, "sampling/importance_sampling_ratio/mean": 0.3304123481114705, "sampling/importance_sampling_ratio/min": 3.936363597745185e-05, "sampling/sampling_logp_difference/max": 2.7600855827331543, "sampling/sampling_logp_difference/mean": 0.004649084061384201, "step": 3930, "step_time": 10.361507259309292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 1130.5, "completions/mean_length": 1066.984375, "completions/mean_terminated_length": 431.20054626464844, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.02153849108144641, "epoch": 0.4735576923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.006838290952146053, "learning_rate": 5.265625e-07, "loss": 0.0008, "num_tokens": 85649489.0, "reward": 0.6911381185054779, "reward_std": 0.25969382375478745, "rewards/reward_fn/mean": 0.6911381185054779, "rewards/reward_fn/std": 0.25969382375478745, "sampling/importance_sampling_ratio/max": 1.7660715579986572, "sampling/importance_sampling_ratio/mean": 0.5256288200616837, "sampling/importance_sampling_ratio/min": 3.667022878062198e-05, "sampling/sampling_logp_difference/max": 1.4870920181274414, "sampling/sampling_logp_difference/mean": 0.004934657132253051, "step": 3940, "step_time": 6.876864640973508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 2062.0, "completions/mean_length": 1178.0625, "completions/mean_terminated_length": 586.052256266276, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.02607654333114624, "epoch": 0.47475961538461536, "frac_reward_zero_std": 0.0, "grad_norm": 0.02274031564593315, "learning_rate": 5.253605769230769e-07, "loss": -0.0095, "num_tokens": 85869415.0, "reward": 0.721182664235433, "reward_std": 0.2673708299795787, "rewards/reward_fn/mean": 0.721182664235433, "rewards/reward_fn/std": 0.2673708150784175, "sampling/importance_sampling_ratio/max": 2.255953033765157, "sampling/importance_sampling_ratio/mean": 0.4263811508814494, "sampling/importance_sampling_ratio/min": 3.0034959308977704e-05, "sampling/sampling_logp_difference/max": 2.0539691050847373, "sampling/sampling_logp_difference/mean": 0.005688841144243876, "step": 3950, "step_time": 10.140396796166897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1484.5, "completions/mean_length": 1145.15625, "completions/mean_terminated_length": 424.17234802246094, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.023019907251000404, "epoch": 0.47596153846153844, "frac_reward_zero_std": 0.125, "grad_norm": 0.009342481382191181, "learning_rate": 5.241586538461539e-07, "loss": -0.003, "num_tokens": 86028305.0, "reward": 0.6415188908576965, "reward_std": 0.3119131550192833, "rewards/reward_fn/mean": 0.6415188908576965, "rewards/reward_fn/std": 0.3119131624698639, "sampling/importance_sampling_ratio/max": 1.2638421058654785, "sampling/importance_sampling_ratio/mean": 0.37677253782749176, "sampling/importance_sampling_ratio/min": 5.9393055209966406e-05, "sampling/sampling_logp_difference/max": 3.197060704231262, "sampling/sampling_logp_difference/mean": 0.0047236590180546045, "step": 3960, "step_time": 7.1620711530558765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19791666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2204.3333333333335, "completions/mean_length": 1005.3333333333334, "completions/mean_terminated_length": 520.9571228027344, "completions/min_length": 115.33333333333333, "completions/min_terminated_length": 115.33333333333333, "entropy": 0.021827303059399127, "epoch": 0.47716346153846156, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.054848119616508484, "learning_rate": 5.229567307692307e-07, "loss": 0.0033, "num_tokens": 86234737.0, "reward": 0.6824521621068319, "reward_std": 0.3178696781396866, "rewards/reward_fn/mean": 0.6824521621068319, "rewards/reward_fn/std": 0.31786968807379407, "sampling/importance_sampling_ratio/max": 1.8916613658269246, "sampling/importance_sampling_ratio/mean": 0.45142223437627155, "sampling/importance_sampling_ratio/min": 7.216187502005293e-05, "sampling/sampling_logp_difference/max": 2.079346776008606, "sampling/sampling_logp_difference/mean": 0.005380969649801652, "step": 3970, "step_time": 10.09076000051573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1924.5, "completions/mean_length": 1531.09375, "completions/mean_terminated_length": 640.9083709716797, "completions/min_length": 202.5, "completions/min_terminated_length": 202.5, "entropy": 0.026307517290115358, "epoch": 0.47836538461538464, "frac_reward_zero_std": 0.0, "grad_norm": 0.004281352274119854, "learning_rate": 5.217548076923077e-07, "loss": -0.0072, "num_tokens": 86419751.0, "reward": 0.7120270133018494, "reward_std": 0.21943399310112, "rewards/reward_fn/mean": 0.7120270133018494, "rewards/reward_fn/std": 0.2194339856505394, "sampling/importance_sampling_ratio/max": 0.908089280128479, "sampling/importance_sampling_ratio/mean": 0.20215918123722076, "sampling/importance_sampling_ratio/min": 5.8482996792008635e-06, "sampling/sampling_logp_difference/max": 3.8323585987091064, "sampling/sampling_logp_difference/mean": 0.005735114449635148, "step": 3980, "step_time": 7.335940352920443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2554.0, "completions/mean_length": 1520.6666666666667, "completions/mean_terminated_length": 896.41455078125, "completions/min_length": 187.33333333333334, "completions/min_terminated_length": 187.33333333333334, "entropy": 0.022079765610396863, "epoch": 0.4795673076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.006691872142255306, "learning_rate": 5.205528846153846e-07, "loss": -0.0012, "num_tokens": 86696167.0, "reward": 0.7289026180903116, "reward_std": 0.25329072773456573, "rewards/reward_fn/mean": 0.7289026180903116, "rewards/reward_fn/std": 0.253290722767512, "sampling/importance_sampling_ratio/max": 1.558694839477539, "sampling/importance_sampling_ratio/mean": 0.30892200271288556, "sampling/importance_sampling_ratio/min": 0.00010552865652850112, "sampling/sampling_logp_difference/max": 2.562873919804891, "sampling/sampling_logp_difference/mean": 0.004824822768568993, "step": 3990, "step_time": 10.441138689685612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2203.5, "completions/mean_length": 1492.78125, "completions/mean_terminated_length": 519.2174072265625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.023895081505179404, "epoch": 0.4807692307692308, "frac_reward_zero_std": 0.0, "grad_norm": 0.012257472611963749, "learning_rate": 5.193509615384614e-07, "loss": -0.0115, "num_tokens": 86873729.0, "reward": 0.6734874546527863, "reward_std": 0.27278590202331543, "rewards/reward_fn/mean": 0.6734874546527863, "rewards/reward_fn/std": 0.27278590202331543, "sampling/importance_sampling_ratio/max": 1.1330320835113525, "sampling/importance_sampling_ratio/mean": 0.29661954939365387, "sampling/importance_sampling_ratio/min": 0.00033631404949119315, "sampling/sampling_logp_difference/max": 2.137817084789276, "sampling/sampling_logp_difference/mean": 0.005264968145638704, "step": 4000, "step_time": 7.253550639469177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3229166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1441.3333333333333, "completions/mean_length": 1314.2395833333333, "completions/mean_terminated_length": 513.1353454589844, "completions/min_length": 163.66666666666666, "completions/min_terminated_length": 163.66666666666666, "entropy": 0.02291204296052456, "epoch": 0.48197115384615385, "frac_reward_zero_std": 0.0, "grad_norm": 0.014570123516023159, "learning_rate": 5.181490384615385e-07, "loss": -0.0035, "num_tokens": 87087400.0, "reward": 0.6666877269744873, "reward_std": 0.2586547036965688, "rewards/reward_fn/mean": 0.6666877269744873, "rewards/reward_fn/std": 0.25865469376246136, "sampling/importance_sampling_ratio/max": 1.050152321656545, "sampling/importance_sampling_ratio/mean": 0.26347583532333374, "sampling/importance_sampling_ratio/min": 1.221718381808993e-05, "sampling/sampling_logp_difference/max": 2.2599066893259683, "sampling/sampling_logp_difference/mean": 0.00488471332937479, "step": 4010, "step_time": 10.231688349321484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2301.5, "completions/mean_length": 1154.75, "completions/mean_terminated_length": 496.98728942871094, "completions/min_length": 150.5, "completions/min_terminated_length": 150.5, "entropy": 0.02347295479848981, "epoch": 0.4831730769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.007380289491266012, "learning_rate": 5.169471153846153e-07, "loss": 0.0079, "num_tokens": 87255096.0, "reward": 0.7760801613330841, "reward_std": 0.2228955253958702, "rewards/reward_fn/mean": 0.7760801613330841, "rewards/reward_fn/std": 0.2228955253958702, "sampling/importance_sampling_ratio/max": 1.0719402432441711, "sampling/importance_sampling_ratio/mean": 0.3213277906179428, "sampling/importance_sampling_ratio/min": 3.650441976787988e-05, "sampling/sampling_logp_difference/max": 2.1932361721992493, "sampling/sampling_logp_difference/mean": 0.005064585478976369, "step": 4020, "step_time": 6.918816917575896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3229166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1848.0, "completions/mean_length": 1277.0416666666667, "completions/mean_terminated_length": 449.62181599934894, "completions/min_length": 117.66666666666667, "completions/min_terminated_length": 117.66666666666667, "entropy": 0.02226821556687355, "epoch": 0.484375, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.019382476806640625, "learning_rate": 5.157451923076923e-07, "loss": -0.0038, "num_tokens": 87480796.0, "reward": 0.7182775735855103, "reward_std": 0.24577667315800986, "rewards/reward_fn/mean": 0.7182775735855103, "rewards/reward_fn/std": 0.24577665825684866, "sampling/importance_sampling_ratio/max": 1.6354806423187256, "sampling/importance_sampling_ratio/mean": 0.3775234321753184, "sampling/importance_sampling_ratio/min": 2.9625887922435368e-05, "sampling/sampling_logp_difference/max": 2.258217732111613, "sampling/sampling_logp_difference/mean": 0.00502436359723409, "step": 4030, "step_time": 10.378423369023949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1896.0, "completions/mean_length": 1236.1875, "completions/mean_terminated_length": 634.0083465576172, "completions/min_length": 135.5, "completions/min_terminated_length": 135.5, "entropy": 0.022669595293700696, "epoch": 0.4855769230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.02358921244740486, "learning_rate": 5.145432692307692e-07, "loss": -0.0015, "num_tokens": 87622272.0, "reward": 0.7186433374881744, "reward_std": 0.2332073524594307, "rewards/reward_fn/mean": 0.7186433374881744, "rewards/reward_fn/std": 0.2332073450088501, "sampling/importance_sampling_ratio/max": 1.9141205549240112, "sampling/importance_sampling_ratio/mean": 0.3973607122898102, "sampling/importance_sampling_ratio/min": 0.0007871906636864878, "sampling/sampling_logp_difference/max": 3.0054137110710144, "sampling/sampling_logp_difference/mean": 0.005081284325569868, "step": 4040, "step_time": 7.110829407908023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1983.6666666666667, "completions/mean_length": 1008.8854166666666, "completions/mean_terminated_length": 456.33971150716144, "completions/min_length": 134.66666666666666, "completions/min_terminated_length": 134.66666666666666, "entropy": 0.02673616595566273, "epoch": 0.48677884615384615, "frac_reward_zero_std": 0.0, "grad_norm": 0.010905119590461254, "learning_rate": 5.133413461538461e-07, "loss": -0.0027, "num_tokens": 87845021.0, "reward": 0.7696438630421957, "reward_std": 0.21852217614650726, "rewards/reward_fn/mean": 0.7696438630421957, "rewards/reward_fn/std": 0.2185221662123998, "sampling/importance_sampling_ratio/max": 1.6452685197194417, "sampling/importance_sampling_ratio/mean": 0.4285670320192973, "sampling/importance_sampling_ratio/min": 2.8978944105991406e-05, "sampling/sampling_logp_difference/max": 2.605972091356913, "sampling/sampling_logp_difference/mean": 0.005741505107531945, "step": 4050, "step_time": 10.179704700782896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2322.0, "completions/mean_length": 1203.28125, "completions/mean_terminated_length": 746.397705078125, "completions/min_length": 182.5, "completions/min_terminated_length": 182.5, "entropy": 0.02763825673609972, "epoch": 0.4879807692307692, "frac_reward_zero_std": 0.0, "grad_norm": 0.021731073036789894, "learning_rate": 5.121394230769231e-07, "loss": -0.0019, "num_tokens": 87975879.0, "reward": 0.6889011263847351, "reward_std": 0.30665314197540283, "rewards/reward_fn/mean": 0.6889011263847351, "rewards/reward_fn/std": 0.30665314197540283, "sampling/importance_sampling_ratio/max": 1.5190638303756714, "sampling/importance_sampling_ratio/mean": 0.29975301027297974, "sampling/importance_sampling_ratio/min": 0.00011885126150446013, "sampling/sampling_logp_difference/max": 3.6346203088760376, "sampling/sampling_logp_difference/mean": 0.00609151809476316, "step": 4060, "step_time": 7.1203361133113505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.17708333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2226.0, "completions/mean_length": 940.4791666666666, "completions/mean_terminated_length": 499.43353271484375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.023048912920057774, "epoch": 0.4891826923076923, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.011420686729252338, "learning_rate": 5.109375e-07, "loss": 0.0124, "num_tokens": 88181245.0, "reward": 0.7076352039972941, "reward_std": 0.30992311239242554, "rewards/reward_fn/mean": 0.7076352039972941, "rewards/reward_fn/std": 0.30992311239242554, "sampling/importance_sampling_ratio/max": 1.7857158581415813, "sampling/importance_sampling_ratio/mean": 0.4709169864654541, "sampling/importance_sampling_ratio/min": 0.0001776394559177182, "sampling/sampling_logp_difference/max": 2.392155567804972, "sampling/sampling_logp_difference/mean": 0.004958336086322864, "step": 4070, "step_time": 10.345288851577788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2737.0, "completions/mean_length": 1187.859375, "completions/mean_terminated_length": 536.6236267089844, "completions/min_length": 137.5, "completions/min_terminated_length": 137.5, "entropy": 0.023598380945622922, "epoch": 0.49038461538461536, "frac_reward_zero_std": 0.0, "grad_norm": 0.006842271890491247, "learning_rate": 5.097355769230769e-07, "loss": -0.0021, "num_tokens": 88324820.0, "reward": 0.7427308559417725, "reward_std": 0.24168141186237335, "rewards/reward_fn/mean": 0.7427308559417725, "rewards/reward_fn/std": 0.24168140441179276, "sampling/importance_sampling_ratio/max": 1.3582602143287659, "sampling/importance_sampling_ratio/mean": 0.3488955646753311, "sampling/importance_sampling_ratio/min": 2.5511059448035667e-05, "sampling/sampling_logp_difference/max": 1.994675636291504, "sampling/sampling_logp_difference/mean": 0.005208560731261969, "step": 4080, "step_time": 7.220381651632488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3541666666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1857.6666666666667, "completions/mean_length": 1355.03125, "completions/mean_terminated_length": 453.53636678059894, "completions/min_length": 122.66666666666667, "completions/min_terminated_length": 122.66666666666667, "entropy": 0.024715470522642134, "epoch": 0.49158653846153844, "frac_reward_zero_std": 0.0, "grad_norm": 0.002875702455639839, "learning_rate": 5.085336538461539e-07, "loss": 0.0175, "num_tokens": 88574063.0, "reward": 0.6444600621859232, "reward_std": 0.2353552281856537, "rewards/reward_fn/mean": 0.6444600621859232, "rewards/reward_fn/std": 0.23535522321859995, "sampling/importance_sampling_ratio/max": 1.3441978693008423, "sampling/importance_sampling_ratio/mean": 0.36283700664838153, "sampling/importance_sampling_ratio/min": 0.0031541869400939504, "sampling/sampling_logp_difference/max": 2.799065907796224, "sampling/sampling_logp_difference/mean": 0.0052283381422360735, "step": 4090, "step_time": 10.253572104498744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2740.0, "completions/mean_length": 1583.75, "completions/mean_terminated_length": 601.2549285888672, "completions/min_length": 111.5, "completions/min_terminated_length": 111.5, "entropy": 0.020000433828681708, "epoch": 0.49278846153846156, "frac_reward_zero_std": 0.0, "grad_norm": 0.004009260330349207, "learning_rate": 5.073317307692308e-07, "loss": -0.0071, "num_tokens": 88759815.0, "reward": 0.5904369652271271, "reward_std": 0.3395298421382904, "rewards/reward_fn/mean": 0.5904369652271271, "rewards/reward_fn/std": 0.339529812335968, "sampling/importance_sampling_ratio/max": 1.3773205280303955, "sampling/importance_sampling_ratio/mean": 0.31318482756614685, "sampling/importance_sampling_ratio/min": 0.00010788822692120448, "sampling/sampling_logp_difference/max": 2.9185925722122192, "sampling/sampling_logp_difference/mean": 0.004241576069034636, "step": 4100, "step_time": 7.275642320048064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3645833333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1751.0, "completions/mean_length": 1375.6979166666667, "completions/mean_terminated_length": 444.94122314453125, "completions/min_length": 124.33333333333333, "completions/min_terminated_length": 124.33333333333333, "entropy": 0.021897288598120214, "epoch": 0.49399038461538464, "frac_reward_zero_std": 0.0, "grad_norm": 0.0061905416660010815, "learning_rate": 5.061298076923076e-07, "loss": 0.0042, "num_tokens": 88996106.0, "reward": 0.6658314863840739, "reward_std": 0.2852521439393361, "rewards/reward_fn/mean": 0.6658314863840739, "rewards/reward_fn/std": 0.2852521439393361, "sampling/importance_sampling_ratio/max": 1.9389790693918865, "sampling/importance_sampling_ratio/mean": 0.37744850913683575, "sampling/importance_sampling_ratio/min": 3.907968781883634e-05, "sampling/sampling_logp_difference/max": 3.9898322025934854, "sampling/sampling_logp_difference/mean": 0.004655508014063041, "step": 4110, "step_time": 10.321924190130085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1612.0, "completions/mean_length": 1207.609375, "completions/mean_terminated_length": 452.29859924316406, "completions/min_length": 144.5, "completions/min_terminated_length": 144.5, "entropy": 0.025612014159560204, "epoch": 0.4951923076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.02689041942358017, "learning_rate": 5.049278846153846e-07, "loss": -0.0086, "num_tokens": 89153673.0, "reward": 0.7404713332653046, "reward_std": 0.23904280364513397, "rewards/reward_fn/mean": 0.7404713332653046, "rewards/reward_fn/std": 0.23904279619455338, "sampling/importance_sampling_ratio/max": 2.073059856891632, "sampling/importance_sampling_ratio/mean": 0.3902330696582794, "sampling/importance_sampling_ratio/min": 9.463063906878233e-05, "sampling/sampling_logp_difference/max": 3.28585422039032, "sampling/sampling_logp_difference/mean": 0.005712142447009683, "step": 4120, "step_time": 7.185384216438979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20833333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1668.0, "completions/mean_length": 983.8020833333334, "completions/mean_terminated_length": 465.5592956542969, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.019900119677186013, "epoch": 0.4963942307692308, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.013725613243877888, "learning_rate": 5.037259615384615e-07, "loss": -0.0064, "num_tokens": 89339382.0, "reward": 0.737487276395162, "reward_std": 0.24547062317530313, "rewards/reward_fn/mean": 0.737487276395162, "rewards/reward_fn/std": 0.24547062317530313, "sampling/importance_sampling_ratio/max": 1.477178692817688, "sampling/importance_sampling_ratio/mean": 0.4495055576165517, "sampling/importance_sampling_ratio/min": 0.0002470368053764105, "sampling/sampling_logp_difference/max": 2.1388566493988037, "sampling/sampling_logp_difference/mean": 0.004818502813577652, "step": 4130, "step_time": 9.920870992168783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2937.5, "completions/mean_length": 1005.328125, "completions/mean_terminated_length": 599.7471313476562, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.023520590178668498, "epoch": 0.49759615384615385, "frac_reward_zero_std": 0.0, "grad_norm": 0.004418111871927977, "learning_rate": 5.025240384615384e-07, "loss": -0.0083, "num_tokens": 89481683.0, "reward": 0.7851914465427399, "reward_std": 0.2310946062207222, "rewards/reward_fn/mean": 0.7851914465427399, "rewards/reward_fn/std": 0.231094591319561, "sampling/importance_sampling_ratio/max": 1.23127481341362, "sampling/importance_sampling_ratio/mean": 0.3580347150564194, "sampling/importance_sampling_ratio/min": 1.692633532002219e-05, "sampling/sampling_logp_difference/max": 2.6317033171653748, "sampling/sampling_logp_difference/mean": 0.0060803929809480906, "step": 4140, "step_time": 6.964744683355093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2708333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1905.0, "completions/mean_length": 1173.7291666666667, "completions/mean_terminated_length": 522.004638671875, "completions/min_length": 141.66666666666666, "completions/min_terminated_length": 141.66666666666666, "entropy": 0.02340581566095352, "epoch": 0.4987980769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.01468319445848465, "learning_rate": 5.013221153846154e-07, "loss": -0.0007, "num_tokens": 89695481.0, "reward": 0.6612536112467448, "reward_std": 0.3070327440897624, "rewards/reward_fn/mean": 0.6612536112467448, "rewards/reward_fn/std": 0.3070327242215474, "sampling/importance_sampling_ratio/max": 1.7174681822458904, "sampling/importance_sampling_ratio/mean": 0.3447679877281189, "sampling/importance_sampling_ratio/min": 3.2104321488664786e-06, "sampling/sampling_logp_difference/max": 2.911191383997599, "sampling/sampling_logp_difference/mean": 0.005472302126387755, "step": 4150, "step_time": 10.32663955045864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1807.5, "completions/mean_length": 1355.90625, "completions/mean_terminated_length": 661.4713439941406, "completions/min_length": 251.5, "completions/min_terminated_length": 251.5, "entropy": 0.023941388912498952, "epoch": 0.5, "frac_reward_zero_std": 0.0, "grad_norm": 0.008116327226161957, "learning_rate": 5.001201923076923e-07, "loss": -0.0047, "num_tokens": 89858963.0, "reward": 0.631374716758728, "reward_std": 0.3178664743900299, "rewards/reward_fn/mean": 0.631374716758728, "rewards/reward_fn/std": 0.3178664445877075, "sampling/importance_sampling_ratio/max": 1.0874022543430328, "sampling/importance_sampling_ratio/mean": 0.22406920790672302, "sampling/importance_sampling_ratio/min": 1.502675877418369e-05, "sampling/sampling_logp_difference/max": 2.4954128861427307, "sampling/sampling_logp_difference/mean": 0.005584182916209102, "step": 4160, "step_time": 7.0380287823267285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19791666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2170.3333333333335, "completions/mean_length": 911.5625, "completions/mean_terminated_length": 405.8488362630208, "completions/min_length": 134.66666666666666, "completions/min_terminated_length": 134.66666666666666, "entropy": 0.024240402691066264, "epoch": 0.5012019230769231, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.014913535676896572, "learning_rate": 4.989182692307691e-07, "loss": -0.0055, "num_tokens": 90050153.0, "reward": 0.7246877153714498, "reward_std": 0.2784367303053538, "rewards/reward_fn/mean": 0.7246877153714498, "rewards/reward_fn/std": 0.27843670547008514, "sampling/importance_sampling_ratio/max": 1.7580302158991497, "sampling/importance_sampling_ratio/mean": 0.47139497598012287, "sampling/importance_sampling_ratio/min": 0.0010714592763179098, "sampling/sampling_logp_difference/max": 2.401627699534098, "sampling/sampling_logp_difference/mean": 0.0052593859533468885, "step": 4170, "step_time": 10.004847680404783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2446.5, "completions/mean_length": 1010.984375, "completions/mean_terminated_length": 407.5357360839844, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.02326416876167059, "epoch": 0.5024038461538461, "frac_reward_zero_std": 0.125, "grad_norm": 0.0035191173665225506, "learning_rate": 4.977163461538461e-07, "loss": 0.0035, "num_tokens": 90198704.0, "reward": 0.7702483534812927, "reward_std": 0.2086462378501892, "rewards/reward_fn/mean": 0.7702483534812927, "rewards/reward_fn/std": 0.2086462378501892, "sampling/importance_sampling_ratio/max": 1.7985581159591675, "sampling/importance_sampling_ratio/mean": 0.5139364004135132, "sampling/importance_sampling_ratio/min": 5.894552668905817e-05, "sampling/sampling_logp_difference/max": 2.5025267601013184, "sampling/sampling_logp_difference/mean": 0.004964182619005442, "step": 4180, "step_time": 6.792256444320083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.22916666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 1830.6666666666667, "completions/mean_length": 1050.625, "completions/mean_terminated_length": 483.70160420735675, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.02185952793806791, "epoch": 0.5036057692307693, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.004746431019157171, "learning_rate": 4.96514423076923e-07, "loss": -0.003, "num_tokens": 90399980.0, "reward": 0.7179300983746847, "reward_std": 0.2922264536221822, "rewards/reward_fn/mean": 0.7179300983746847, "rewards/reward_fn/std": 0.2922264536221822, "sampling/importance_sampling_ratio/max": 1.5898785988489788, "sampling/importance_sampling_ratio/mean": 0.448869655529658, "sampling/importance_sampling_ratio/min": 0.00015151862680795603, "sampling/sampling_logp_difference/max": 2.6603688398996987, "sampling/sampling_logp_difference/mean": 0.00460523118575414, "step": 4190, "step_time": 10.194913215190173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1724.5, "completions/mean_length": 1160.34375, "completions/mean_terminated_length": 446.92918395996094, "completions/min_length": 91.5, "completions/min_terminated_length": 91.5, "entropy": 0.019448713213205338, "epoch": 0.5048076923076923, "frac_reward_zero_std": 0.125, "grad_norm": 0.01833544671535492, "learning_rate": 4.953125e-07, "loss": -0.0046, "num_tokens": 90549666.0, "reward": 0.6287437379360199, "reward_std": 0.3531806468963623, "rewards/reward_fn/mean": 0.6287437379360199, "rewards/reward_fn/std": 0.3531806319952011, "sampling/importance_sampling_ratio/max": 1.171753168106079, "sampling/importance_sampling_ratio/mean": 0.3965045213699341, "sampling/importance_sampling_ratio/min": 5.96166883042315e-05, "sampling/sampling_logp_difference/max": 3.880075454711914, "sampling/sampling_logp_difference/mean": 0.0043705846183001995, "step": 4200, "step_time": 6.886546530481428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3229166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2154.0, "completions/mean_length": 1432.40625, "completions/mean_terminated_length": 688.2176513671875, "completions/min_length": 140.33333333333334, "completions/min_terminated_length": 140.33333333333334, "entropy": 0.023557924292981626, "epoch": 0.5060096153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.014771194197237492, "learning_rate": 4.941105769230769e-07, "loss": -0.0033, "num_tokens": 90807465.0, "reward": 0.7197946707407633, "reward_std": 0.25889890392621356, "rewards/reward_fn/mean": 0.7197946707407633, "rewards/reward_fn/std": 0.25889889399210614, "sampling/importance_sampling_ratio/max": 1.3151236375172932, "sampling/importance_sampling_ratio/mean": 0.31099363168080646, "sampling/importance_sampling_ratio/min": 2.747181224549422e-05, "sampling/sampling_logp_difference/max": 2.819603125254313, "sampling/sampling_logp_difference/mean": 0.005620355096956094, "step": 4210, "step_time": 10.281682734563947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2604.0, "completions/mean_length": 1500.125, "completions/mean_terminated_length": 743.6148376464844, "completions/min_length": 142.5, "completions/min_terminated_length": 142.5, "entropy": 0.02369897738099098, "epoch": 0.5072115384615384, "frac_reward_zero_std": 0.0, "grad_norm": 0.004452145658433437, "learning_rate": 4.929086538461539e-07, "loss": -0.0173, "num_tokens": 90955329.0, "reward": 0.6302545964717865, "reward_std": 0.3047827184200287, "rewards/reward_fn/mean": 0.6302545964717865, "rewards/reward_fn/std": 0.3047827184200287, "sampling/importance_sampling_ratio/max": 1.6817407608032227, "sampling/importance_sampling_ratio/mean": 0.31086595356464386, "sampling/importance_sampling_ratio/min": 7.872120158936013e-05, "sampling/sampling_logp_difference/max": 3.1097511053085327, "sampling/sampling_logp_difference/mean": 0.00566071760840714, "step": 4220, "step_time": 6.845414185058326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2031.6666666666667, "completions/mean_length": 1197.9583333333333, "completions/mean_terminated_length": 503.8327941894531, "completions/min_length": 166.66666666666666, "completions/min_terminated_length": 166.66666666666666, "entropy": 0.02445578873157501, "epoch": 0.5084134615384616, "frac_reward_zero_std": 0.0, "grad_norm": 0.015526466071605682, "learning_rate": 4.917067307692308e-07, "loss": 0.0026, "num_tokens": 91192669.0, "reward": 0.7411717573801676, "reward_std": 0.22974329690138498, "rewards/reward_fn/mean": 0.7411717573801676, "rewards/reward_fn/std": 0.22974328696727753, "sampling/importance_sampling_ratio/max": 1.3123957713445027, "sampling/importance_sampling_ratio/mean": 0.3491006890932719, "sampling/importance_sampling_ratio/min": 9.702455747628846e-05, "sampling/sampling_logp_difference/max": 4.040042956670125, "sampling/sampling_logp_difference/mean": 0.005162875633686781, "step": 4230, "step_time": 10.097371739149093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1436.0, "completions/mean_length": 1016.671875, "completions/mean_terminated_length": 420.00758361816406, "completions/min_length": 91.5, "completions/min_terminated_length": 91.5, "entropy": 0.020240560360252857, "epoch": 0.5096153846153846, "frac_reward_zero_std": 0.125, "grad_norm": 0.012402941472828388, "learning_rate": 4.905048076923076e-07, "loss": 0.0056, "num_tokens": 91332616.0, "reward": 0.6622971296310425, "reward_std": 0.308438703417778, "rewards/reward_fn/mean": 0.6622971296310425, "rewards/reward_fn/std": 0.308438703417778, "sampling/importance_sampling_ratio/max": 2.146367311477661, "sampling/importance_sampling_ratio/mean": 0.5302667617797852, "sampling/importance_sampling_ratio/min": 8.14970990177244e-05, "sampling/sampling_logp_difference/max": 1.9475505352020264, "sampling/sampling_logp_difference/mean": 0.00499401125125587, "step": 4240, "step_time": 7.2176385768689215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.22916666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2304.6666666666665, "completions/mean_length": 1012.4270833333334, "completions/mean_terminated_length": 420.93825276692706, "completions/min_length": 101.33333333333333, "completions/min_terminated_length": 101.33333333333333, "entropy": 0.02337054703384638, "epoch": 0.5108173076923077, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.022808559238910675, "learning_rate": 4.893028846153846e-07, "loss": 0.0121, "num_tokens": 91562881.0, "reward": 0.7025015155474345, "reward_std": 0.28929976125558216, "rewards/reward_fn/mean": 0.7025015155474345, "rewards/reward_fn/std": 0.28929975132147473, "sampling/importance_sampling_ratio/max": 1.3034795920054119, "sampling/importance_sampling_ratio/mean": 0.47178540627161664, "sampling/importance_sampling_ratio/min": 6.422310900688899e-05, "sampling/sampling_logp_difference/max": 2.4440630276997886, "sampling/sampling_logp_difference/mean": 0.00566773737470309, "step": 4250, "step_time": 10.599262138176709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2313.5, "completions/mean_length": 1426.71875, "completions/mean_terminated_length": 639.2635192871094, "completions/min_length": 124.5, "completions/min_terminated_length": 124.5, "entropy": 0.026273785531520842, "epoch": 0.5120192307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.009776204824447632, "learning_rate": 4.881009615384615e-07, "loss": -0.0024, "num_tokens": 91727799.0, "reward": 0.7149775624275208, "reward_std": 0.25966402888298035, "rewards/reward_fn/mean": 0.7149775624275208, "rewards/reward_fn/std": 0.25966402143239975, "sampling/importance_sampling_ratio/max": 1.1204203963279724, "sampling/importance_sampling_ratio/mean": 0.2884763926267624, "sampling/importance_sampling_ratio/min": 4.710630310000852e-05, "sampling/sampling_logp_difference/max": 22.590437650680542, "sampling/sampling_logp_difference/mean": 0.00641404720954597, "step": 4260, "step_time": 7.054881660453975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2604166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2464.0, "completions/mean_length": 1284.2395833333333, "completions/mean_terminated_length": 700.4779154459635, "completions/min_length": 136.66666666666666, "completions/min_terminated_length": 136.66666666666666, "entropy": 0.019007045961916446, "epoch": 0.5132211538461539, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.008520356379449368, "learning_rate": 4.868990384615385e-07, "loss": 0.0043, "num_tokens": 91951702.0, "reward": 0.5932111938794454, "reward_std": 0.3680407206217448, "rewards/reward_fn/mean": 0.5932111938794454, "rewards/reward_fn/std": 0.36804070075352985, "sampling/importance_sampling_ratio/max": 1.8886748154958088, "sampling/importance_sampling_ratio/mean": 0.417229304711024, "sampling/importance_sampling_ratio/min": 0.0002440561875118874, "sampling/sampling_logp_difference/max": 2.9998710552851358, "sampling/sampling_logp_difference/mean": 0.0042762548352281255, "step": 4270, "step_time": 10.239896729867905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 3000.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 849.140625, "completions/mean_terminated_length": 300.3935241699219, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.024918042682111265, "epoch": 0.5144230769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.0075174542143940926, "learning_rate": 4.856971153846154e-07, "loss": -0.0047, "num_tokens": 92066263.0, "reward": 0.7279027104377747, "reward_std": 0.30332329869270325, "rewards/reward_fn/mean": 0.7279027104377747, "rewards/reward_fn/std": 0.30332328379154205, "sampling/importance_sampling_ratio/max": 1.516152799129486, "sampling/importance_sampling_ratio/mean": 0.46306468546390533, "sampling/importance_sampling_ratio/min": 0.0001897960955830058, "sampling/sampling_logp_difference/max": 2.121557593345642, "sampling/sampling_logp_difference/mean": 0.006335486425086856, "step": 4280, "step_time": 6.816265312582255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19791666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 1742.3333333333333, "completions/mean_length": 900.4166666666666, "completions/mean_terminated_length": 400.0745391845703, "completions/min_length": 132.33333333333334, "completions/min_terminated_length": 132.33333333333334, "entropy": 0.022458894550800322, "epoch": 0.515625, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.0134509252384305, "learning_rate": 4.844951923076923e-07, "loss": -0.0006, "num_tokens": 92244143.0, "reward": 0.6464477280775706, "reward_std": 0.31056642532348633, "rewards/reward_fn/mean": 0.6464477280775706, "rewards/reward_fn/std": 0.31056641538937885, "sampling/importance_sampling_ratio/max": 0.9750884572664896, "sampling/importance_sampling_ratio/mean": 0.4069388310114543, "sampling/importance_sampling_ratio/min": 0.00011307646066901118, "sampling/sampling_logp_difference/max": 2.29990816116333, "sampling/sampling_logp_difference/mean": 0.005195819151898225, "step": 4290, "step_time": 10.117703214660287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2530.0, "completions/mean_length": 1076.859375, "completions/mean_terminated_length": 538.3799743652344, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.02717800848186016, "epoch": 0.5168269230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.013032936491072178, "learning_rate": 4.832932692307692e-07, "loss": -0.0078, "num_tokens": 92390782.0, "reward": 0.7750627100467682, "reward_std": 0.24803828448057175, "rewards/reward_fn/mean": 0.7750627100467682, "rewards/reward_fn/std": 0.24803827702999115, "sampling/importance_sampling_ratio/max": 1.7320418953895569, "sampling/importance_sampling_ratio/mean": 0.37841343879699707, "sampling/importance_sampling_ratio/min": 5.611892902379623e-06, "sampling/sampling_logp_difference/max": 1.6831782460212708, "sampling/sampling_logp_difference/mean": 0.005587429739534855, "step": 4300, "step_time": 7.094810607843101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.17708333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2451.3333333333335, "completions/mean_length": 1137.9895833333333, "completions/mean_terminated_length": 734.9545694986979, "completions/min_length": 104.66666666666667, "completions/min_terminated_length": 104.66666666666667, "entropy": 0.023456978611648083, "epoch": 0.5180288461538461, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.004202248994261026, "learning_rate": 4.820913461538461e-07, "loss": 0.0012, "num_tokens": 92579533.0, "reward": 0.7113542755444845, "reward_std": 0.3109319309393565, "rewards/reward_fn/mean": 0.7113542755444845, "rewards/reward_fn/std": 0.3109319359064102, "sampling/importance_sampling_ratio/max": 1.69949205716451, "sampling/importance_sampling_ratio/mean": 0.4206385016441345, "sampling/importance_sampling_ratio/min": 5.694927419123512e-06, "sampling/sampling_logp_difference/max": 1.7969046433766682, "sampling/sampling_logp_difference/mean": 0.005200351433207591, "step": 4310, "step_time": 10.135046657640487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1884.5, "completions/mean_length": 1542.265625, "completions/mean_terminated_length": 612.7816009521484, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.021229505911469458, "epoch": 0.5192307692307693, "frac_reward_zero_std": 0.0, "grad_norm": 0.006279489025473595, "learning_rate": 4.808894230769231e-07, "loss": -0.0041, "num_tokens": 92759678.0, "reward": 0.6699522733688354, "reward_std": 0.30058975517749786, "rewards/reward_fn/mean": 0.6699522733688354, "rewards/reward_fn/std": 0.30058974027633667, "sampling/importance_sampling_ratio/max": 1.8802511096000671, "sampling/importance_sampling_ratio/mean": 0.3432048112154007, "sampling/importance_sampling_ratio/min": 2.9271672190134268e-05, "sampling/sampling_logp_difference/max": 4.613308310508728, "sampling/sampling_logp_difference/mean": 0.005128186894580722, "step": 4320, "step_time": 7.253895723354072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.17708333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2258.0, "completions/mean_length": 915.0416666666666, "completions/mean_terminated_length": 462.5387776692708, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.02426688838750124, "epoch": 0.5204326923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.00818629376590252, "learning_rate": 4.796875e-07, "loss": -0.0008, "num_tokens": 92956418.0, "reward": 0.7928298513094584, "reward_std": 0.22139442960421243, "rewards/reward_fn/mean": 0.7928298513094584, "rewards/reward_fn/std": 0.22139443457126617, "sampling/importance_sampling_ratio/max": 1.181322991847992, "sampling/importance_sampling_ratio/mean": 0.4001128176848094, "sampling/importance_sampling_ratio/min": 0.0001828558641439789, "sampling/sampling_logp_difference/max": 1.812279224395752, "sampling/sampling_logp_difference/mean": 0.0052497076491514845, "step": 4330, "step_time": 10.364325905311853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2166.5, "completions/mean_length": 920.34375, "completions/mean_terminated_length": 487.61785888671875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.024056841246783735, "epoch": 0.5216346153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.008673143573105335, "learning_rate": 4.784855769230768e-07, "loss": 0.0013, "num_tokens": 93104080.0, "reward": 0.8078491985797882, "reward_std": 0.20928437262773514, "rewards/reward_fn/mean": 0.8078491985797882, "rewards/reward_fn/std": 0.20928436517715454, "sampling/importance_sampling_ratio/max": 2.1951653957366943, "sampling/importance_sampling_ratio/mean": 0.4899916648864746, "sampling/importance_sampling_ratio/min": 0.0005795077704533469, "sampling/sampling_logp_difference/max": 1.6843767166137695, "sampling/sampling_logp_difference/mean": 0.005029693711549044, "step": 4340, "step_time": 7.041948760021478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3020833333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2118.3333333333335, "completions/mean_length": 1268.8333333333333, "completions/mean_terminated_length": 520.5223693847656, "completions/min_length": 122.33333333333333, "completions/min_terminated_length": 122.33333333333333, "entropy": 0.02129550389945507, "epoch": 0.5228365384615384, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.002746812766417861, "learning_rate": 4.772836538461538e-07, "loss": 0.0182, "num_tokens": 93463216.0, "reward": 0.6055963635444641, "reward_std": 0.3283649981021881, "rewards/reward_fn/mean": 0.6055963635444641, "rewards/reward_fn/std": 0.3283649981021881, "sampling/importance_sampling_ratio/max": 1.6137237946192424, "sampling/importance_sampling_ratio/mean": 0.4051678280035655, "sampling/importance_sampling_ratio/min": 7.072509230987567e-06, "sampling/sampling_logp_difference/max": 8.519267876942953, "sampling/sampling_logp_difference/mean": 0.005310297633210818, "step": 4350, "step_time": 13.158957571722567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1512.5, "completions/mean_length": 945.78125, "completions/mean_terminated_length": 368.8808288574219, "completions/min_length": 124.5, "completions/min_terminated_length": 124.5, "entropy": 0.022087777405977248, "epoch": 0.5240384615384616, "frac_reward_zero_std": 0.125, "grad_norm": 0.013819965533912182, "learning_rate": 4.7608173076923074e-07, "loss": -0.0107, "num_tokens": 93593322.0, "reward": 0.6702144742012024, "reward_std": 0.3113681823015213, "rewards/reward_fn/mean": 0.6702144742012024, "rewards/reward_fn/std": 0.3113681748509407, "sampling/importance_sampling_ratio/max": 1.383341908454895, "sampling/importance_sampling_ratio/mean": 0.5090738832950592, "sampling/importance_sampling_ratio/min": 0.00019339491927894414, "sampling/sampling_logp_difference/max": 3.347282886505127, "sampling/sampling_logp_difference/mean": 0.004567411495372653, "step": 4360, "step_time": 8.237524408567698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1706.6666666666667, "completions/mean_length": 1224.1666666666667, "completions/mean_terminated_length": 529.0260721842448, "completions/min_length": 123.66666666666667, "completions/min_terminated_length": 123.66666666666667, "entropy": 0.026928636990487576, "epoch": 0.5252403846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.008380667306482792, "learning_rate": 4.748798076923077e-07, "loss": 0.01, "num_tokens": 93822122.0, "reward": 0.7336420019467672, "reward_std": 0.25950386623541516, "rewards/reward_fn/mean": 0.7336420019467672, "rewards/reward_fn/std": 0.2595038563013077, "sampling/importance_sampling_ratio/max": 1.4631431500116985, "sampling/importance_sampling_ratio/mean": 0.32146891951560974, "sampling/importance_sampling_ratio/min": 3.1072332452216265e-05, "sampling/sampling_logp_difference/max": 2.126885970433553, "sampling/sampling_logp_difference/mean": 0.005595130069802205, "step": 4370, "step_time": 10.233232573699206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1119.0, "completions/mean_length": 966.78125, "completions/mean_terminated_length": 361.17759704589844, "completions/min_length": 127.5, "completions/min_terminated_length": 127.5, "entropy": 0.02560363579541445, "epoch": 0.5264423076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.008985430002212524, "learning_rate": 4.736778846153846e-07, "loss": -0.0028, "num_tokens": 93963020.0, "reward": 0.77187180519104, "reward_std": 0.23135054111480713, "rewards/reward_fn/mean": 0.77187180519104, "rewards/reward_fn/std": 0.23135053366422653, "sampling/importance_sampling_ratio/max": 1.2528004050254822, "sampling/importance_sampling_ratio/mean": 0.4802455008029938, "sampling/importance_sampling_ratio/min": 0.000398811620698325, "sampling/sampling_logp_difference/max": 1.4779755473136902, "sampling/sampling_logp_difference/mean": 0.0051768189296126366, "step": 4380, "step_time": 6.912096271663904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.22916666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2608.3333333333335, "completions/mean_length": 1111.6979166666667, "completions/mean_terminated_length": 545.398203531901, "completions/min_length": 82.33333333333333, "completions/min_terminated_length": 82.33333333333333, "entropy": 0.024230160750448704, "epoch": 0.5276442307692307, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.0048562511801719666, "learning_rate": 4.724759615384615e-07, "loss": -0.0022, "num_tokens": 94174887.0, "reward": 0.7656090259552002, "reward_std": 0.2510354071855545, "rewards/reward_fn/mean": 0.7656090259552002, "rewards/reward_fn/std": 0.2510354071855545, "sampling/importance_sampling_ratio/max": 1.4366144339243572, "sampling/importance_sampling_ratio/mean": 0.40560545523961383, "sampling/importance_sampling_ratio/min": 0.0001646169511673179, "sampling/sampling_logp_difference/max": 2.6547465324401855, "sampling/sampling_logp_difference/mean": 0.004951813103010257, "step": 4390, "step_time": 10.321976816467942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2677.0, "completions/mean_length": 1007.53125, "completions/mean_terminated_length": 505.12501525878906, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.01948415543884039, "epoch": 0.5288461538461539, "frac_reward_zero_std": 0.25, "grad_norm": 0.0020061470568180084, "learning_rate": 4.7127403846153845e-07, "loss": -0.0035, "num_tokens": 94300033.0, "reward": 0.5952139645814896, "reward_std": 0.25964945554733276, "rewards/reward_fn/mean": 0.5952139645814896, "rewards/reward_fn/std": 0.25964945554733276, "sampling/importance_sampling_ratio/max": 1.2655028104782104, "sampling/importance_sampling_ratio/mean": 0.4790639728307724, "sampling/importance_sampling_ratio/min": 8.135558562116785e-05, "sampling/sampling_logp_difference/max": 3.2348896265029907, "sampling/sampling_logp_difference/mean": 0.004329811432398856, "step": 4400, "step_time": 6.9600397374480965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2059.0, "completions/mean_length": 941.34375, "completions/mean_terminated_length": 560.1437174479166, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.025574211589992048, "epoch": 0.5300480769230769, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.008321749977767467, "learning_rate": 4.700721153846154e-07, "loss": -0.0047, "num_tokens": 94496042.0, "reward": 0.7430392305056254, "reward_std": 0.2656444311141968, "rewards/reward_fn/mean": 0.7430392305056254, "rewards/reward_fn/std": 0.26564442614714306, "sampling/importance_sampling_ratio/max": 1.308640976746877, "sampling/importance_sampling_ratio/mean": 0.33803210655848187, "sampling/importance_sampling_ratio/min": 0.00011608945654491738, "sampling/sampling_logp_difference/max": 2.189754764238993, "sampling/sampling_logp_difference/mean": 0.006201072906454404, "step": 4410, "step_time": 9.991434176731854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1514.0, "completions/mean_length": 827.171875, "completions/mean_terminated_length": 399.5166702270508, "completions/min_length": 172.5, "completions/min_terminated_length": 172.5, "entropy": 0.024277307838201524, "epoch": 0.53125, "frac_reward_zero_std": 0.25, "grad_norm": 0.004592087119817734, "learning_rate": 4.6887019230769234e-07, "loss": -0.0028, "num_tokens": 94617621.0, "reward": 0.6745665371417999, "reward_std": 0.32506389915943146, "rewards/reward_fn/mean": 0.6745665371417999, "rewards/reward_fn/std": 0.32506391406059265, "sampling/importance_sampling_ratio/max": 1.0950633585453033, "sampling/importance_sampling_ratio/mean": 0.4109777361154556, "sampling/importance_sampling_ratio/min": 1.839846254370059e-05, "sampling/sampling_logp_difference/max": 1.6190252900123596, "sampling/sampling_logp_difference/mean": 0.004995835246518254, "step": 4420, "step_time": 6.946923177968711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3645833333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2389.3333333333335, "completions/mean_length": 1519.8541666666667, "completions/mean_terminated_length": 686.3839416503906, "completions/min_length": 110.33333333333333, "completions/min_terminated_length": 110.33333333333333, "entropy": 0.02035675011575222, "epoch": 0.5324519230769231, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.0033804667182266712, "learning_rate": 4.676682692307692e-07, "loss": 0.0019, "num_tokens": 94879063.0, "reward": 0.5929886996746063, "reward_std": 0.3149074614048004, "rewards/reward_fn/mean": 0.5929886996746063, "rewards/reward_fn/std": 0.3149074614048004, "sampling/importance_sampling_ratio/max": 1.7200231154759724, "sampling/importance_sampling_ratio/mean": 0.3391305555899938, "sampling/importance_sampling_ratio/min": 2.1920827634858142e-05, "sampling/sampling_logp_difference/max": 4.074146747589111, "sampling/sampling_logp_difference/mean": 0.004544524941593409, "step": 4430, "step_time": 10.513655652850867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2611.5, "completions/mean_length": 1310.71875, "completions/mean_terminated_length": 513.1357727050781, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.023631223291158677, "epoch": 0.5336538461538461, "frac_reward_zero_std": 0.125, "grad_norm": 0.001393849728628993, "learning_rate": 4.664663461538461e-07, "loss": 0.0012, "num_tokens": 95025021.0, "reward": 0.6926092505455017, "reward_std": 0.2647063136100769, "rewards/reward_fn/mean": 0.6926092505455017, "rewards/reward_fn/std": 0.2647062838077545, "sampling/importance_sampling_ratio/max": 1.8770534992218018, "sampling/importance_sampling_ratio/mean": 0.45554183423519135, "sampling/importance_sampling_ratio/min": 0.0011417887565130513, "sampling/sampling_logp_difference/max": 2.156668782234192, "sampling/sampling_logp_difference/mean": 0.005240712082013488, "step": 4440, "step_time": 7.21284160586074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20833333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1999.3333333333333, "completions/mean_length": 1047.5729166666667, "completions/mean_terminated_length": 526.5448913574219, "completions/min_length": 121.33333333333333, "completions/min_terminated_length": 121.33333333333333, "entropy": 0.02667562961578369, "epoch": 0.5348557692307693, "frac_reward_zero_std": 0.0, "grad_norm": 0.010900652967393398, "learning_rate": 4.6526442307692306e-07, "loss": -0.0037, "num_tokens": 95243836.0, "reward": 0.7340333263079325, "reward_std": 0.2668135166168213, "rewards/reward_fn/mean": 0.7340333263079325, "rewards/reward_fn/std": 0.2668135116497676, "sampling/importance_sampling_ratio/max": 1.6613211234410603, "sampling/importance_sampling_ratio/mean": 0.39736006657282513, "sampling/importance_sampling_ratio/min": 4.152278044481742e-05, "sampling/sampling_logp_difference/max": 2.6712234814961753, "sampling/sampling_logp_difference/mean": 0.005869806123276551, "step": 4450, "step_time": 10.269270282238722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2364.5, "completions/mean_length": 1025.8125, "completions/mean_terminated_length": 428.7659912109375, "completions/min_length": 112.5, "completions/min_terminated_length": 112.5, "entropy": 0.023943454772233964, "epoch": 0.5360576923076923, "frac_reward_zero_std": 0.125, "grad_norm": 0.022407179698348045, "learning_rate": 4.6406249999999995e-07, "loss": -0.0031, "num_tokens": 95370376.0, "reward": 0.6940383017063141, "reward_std": 0.2974670082330704, "rewards/reward_fn/mean": 0.6940383017063141, "rewards/reward_fn/std": 0.29746702313423157, "sampling/importance_sampling_ratio/max": 1.3114949464797974, "sampling/importance_sampling_ratio/mean": 0.4797673672437668, "sampling/importance_sampling_ratio/min": 2.153201194232679e-06, "sampling/sampling_logp_difference/max": 1.4852725267410278, "sampling/sampling_logp_difference/mean": 0.0053761068265885115, "step": 4460, "step_time": 6.912623698357493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 2438.3333333333335, "completions/mean_length": 1167.7708333333333, "completions/mean_terminated_length": 564.8798828125, "completions/min_length": 127.66666666666667, "completions/min_terminated_length": 127.66666666666667, "entropy": 0.02422673236578703, "epoch": 0.5372596153846154, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.010542493313550949, "learning_rate": 4.628605769230769e-07, "loss": -0.0083, "num_tokens": 95600914.0, "reward": 0.6956796050071716, "reward_std": 0.2682098001241684, "rewards/reward_fn/mean": 0.6956796050071716, "rewards/reward_fn/std": 0.2682097802559535, "sampling/importance_sampling_ratio/max": 2.254066268603007, "sampling/importance_sampling_ratio/mean": 0.41750941673914593, "sampling/importance_sampling_ratio/min": 0.001332179856641839, "sampling/sampling_logp_difference/max": 2.260657032330831, "sampling/sampling_logp_difference/mean": 0.005326349598666032, "step": 4470, "step_time": 10.459924803953617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1649.0, "completions/mean_length": 1037.046875, "completions/mean_terminated_length": 460.125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.02713506668806076, "epoch": 0.5384615384615384, "frac_reward_zero_std": 0.0, "grad_norm": 0.005397660192102194, "learning_rate": 4.6165865384615384e-07, "loss": -0.0093, "num_tokens": 95740677.0, "reward": 0.7074108421802521, "reward_std": 0.2447560578584671, "rewards/reward_fn/mean": 0.7074108421802521, "rewards/reward_fn/std": 0.2447560429573059, "sampling/importance_sampling_ratio/max": 1.4325000047683716, "sampling/importance_sampling_ratio/mean": 0.32816600799560547, "sampling/importance_sampling_ratio/min": 7.892935173003934e-06, "sampling/sampling_logp_difference/max": 3.7436124086380005, "sampling/sampling_logp_difference/mean": 0.005510026356205344, "step": 4480, "step_time": 6.931928768660873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 1906.6666666666667, "completions/mean_length": 1100.7083333333333, "completions/mean_terminated_length": 516.1423492431641, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.018588326033204795, "epoch": 0.5396634615384616, "frac_reward_zero_std": 0.0, "grad_norm": 0.009800145402550697, "learning_rate": 4.604567307692307e-07, "loss": -0.0082, "num_tokens": 95958705.0, "reward": 0.7666075229644775, "reward_std": 0.217372494439284, "rewards/reward_fn/mean": 0.7666075229644775, "rewards/reward_fn/std": 0.2173725018898646, "sampling/importance_sampling_ratio/max": 1.3542592128117878, "sampling/importance_sampling_ratio/mean": 0.4502638876438141, "sampling/importance_sampling_ratio/min": 0.0011193359423486982, "sampling/sampling_logp_difference/max": 1.663058598836263, "sampling/sampling_logp_difference/mean": 0.004407102552553018, "step": 4490, "step_time": 10.205890802666545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 1922.5, "completions/mean_length": 1118.65625, "completions/mean_terminated_length": 479.1573486328125, "completions/min_length": 126.5, "completions/min_terminated_length": 126.5, "entropy": 0.02456192709505558, "epoch": 0.5408653846153846, "frac_reward_zero_std": 0.125, "grad_norm": 0.0047024087980389595, "learning_rate": 4.5925480769230767e-07, "loss": 0.0004, "num_tokens": 96108627.0, "reward": 0.6102513670921326, "reward_std": 0.3304799199104309, "rewards/reward_fn/mean": 0.6102513670921326, "rewards/reward_fn/std": 0.3304799050092697, "sampling/importance_sampling_ratio/max": 1.299405574798584, "sampling/importance_sampling_ratio/mean": 0.3935699611902237, "sampling/importance_sampling_ratio/min": 2.3038870494929142e-05, "sampling/sampling_logp_difference/max": 1.9591262340545654, "sampling/sampling_logp_difference/mean": 0.005477376282215118, "step": 4500, "step_time": 7.175712369009853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2001.6666666666667, "completions/mean_length": 1012.0520833333334, "completions/mean_terminated_length": 448.26842244466144, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.022729031555354596, "epoch": 0.5420673076923077, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.04126787558197975, "learning_rate": 4.580528846153846e-07, "loss": -0.0105, "num_tokens": 96315256.0, "reward": 0.6769774556159973, "reward_std": 0.32532623410224915, "rewards/reward_fn/mean": 0.6769774556159973, "rewards/reward_fn/std": 0.32532623410224915, "sampling/importance_sampling_ratio/max": 2.2008567651112876, "sampling/importance_sampling_ratio/mean": 0.4372008442878723, "sampling/importance_sampling_ratio/min": 0.0002881079502306723, "sampling/sampling_logp_difference/max": 2.7277565797170005, "sampling/sampling_logp_difference/mean": 0.005589481443166733, "step": 4510, "step_time": 10.201183252036572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1466.5, "completions/mean_length": 1279.53125, "completions/mean_terminated_length": 438.78839111328125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.021475859545171262, "epoch": 0.5432692307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.021777616813778877, "learning_rate": 4.5685096153846155e-07, "loss": -0.012, "num_tokens": 96474466.0, "reward": 0.6680465340614319, "reward_std": 0.2898593842983246, "rewards/reward_fn/mean": 0.6680465340614319, "rewards/reward_fn/std": 0.2898593693971634, "sampling/importance_sampling_ratio/max": 1.253152310848236, "sampling/importance_sampling_ratio/mean": 0.3522329181432724, "sampling/importance_sampling_ratio/min": 9.963031334336847e-05, "sampling/sampling_logp_difference/max": 2.900739908218384, "sampling/sampling_logp_difference/mean": 0.004471097839996219, "step": 4520, "step_time": 6.950342418625951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2916666666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1837.6666666666667, "completions/mean_length": 1263.3125, "completions/mean_terminated_length": 586.8223775227865, "completions/min_length": 124.33333333333333, "completions/min_terminated_length": 124.33333333333333, "entropy": 0.024597142077982425, "epoch": 0.5444711538461539, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.0075026825070381165, "learning_rate": 4.5564903846153844e-07, "loss": -0.0025, "num_tokens": 96713424.0, "reward": 0.6589117050170898, "reward_std": 0.30766181151072186, "rewards/reward_fn/mean": 0.6589117050170898, "rewards/reward_fn/std": 0.3076617916425069, "sampling/importance_sampling_ratio/max": 0.9573721686999003, "sampling/importance_sampling_ratio/mean": 0.25919008255004883, "sampling/importance_sampling_ratio/min": 0.0004140347112600769, "sampling/sampling_logp_difference/max": 2.126110633214315, "sampling/sampling_logp_difference/mean": 0.005089471892764171, "step": 4530, "step_time": 10.34036869732663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2044.5, "completions/mean_length": 1044.84375, "completions/mean_terminated_length": 500.6542510986328, "completions/min_length": 163.5, "completions/min_terminated_length": 163.5, "entropy": 0.024125390872359275, "epoch": 0.5456730769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.012271769344806671, "learning_rate": 4.544471153846154e-07, "loss": -0.0088, "num_tokens": 96840606.0, "reward": 0.753156840801239, "reward_std": 0.2494831457734108, "rewards/reward_fn/mean": 0.753156840801239, "rewards/reward_fn/std": 0.2494831457734108, "sampling/importance_sampling_ratio/max": 2.072512924671173, "sampling/importance_sampling_ratio/mean": 0.41581158339977264, "sampling/importance_sampling_ratio/min": 3.819094672508072e-05, "sampling/sampling_logp_difference/max": 3.217781722545624, "sampling/sampling_logp_difference/mean": 0.00581859122030437, "step": 4540, "step_time": 7.108623884245754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1940.6666666666667, "completions/mean_length": 1449.0833333333333, "completions/mean_terminated_length": 650.8119812011719, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.021285860799252987, "epoch": 0.546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.005973689258098602, "learning_rate": 4.5324519230769233e-07, "loss": 0.0006, "num_tokens": 97107478.0, "reward": 0.6872909069061279, "reward_std": 0.26954660813013714, "rewards/reward_fn/mean": 0.6872909069061279, "rewards/reward_fn/std": 0.26954660813013714, "sampling/importance_sampling_ratio/max": 1.5216794808705647, "sampling/importance_sampling_ratio/mean": 0.3539614776770274, "sampling/importance_sampling_ratio/min": 1.8451291983486346e-05, "sampling/sampling_logp_difference/max": 2.724564472834269, "sampling/sampling_logp_difference/mean": 0.004899316777785619, "step": 4550, "step_time": 10.449900711420923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1748.0, "completions/mean_length": 861.328125, "completions/mean_terminated_length": 368.5696258544922, "completions/min_length": 116.5, "completions/min_terminated_length": 116.5, "entropy": 0.020207677967846394, "epoch": 0.5480769230769231, "frac_reward_zero_std": 0.125, "grad_norm": 0.002796800574287772, "learning_rate": 4.5204326923076917e-07, "loss": -0.002, "num_tokens": 97232691.0, "reward": 0.7761774957180023, "reward_std": 0.21673373878002167, "rewards/reward_fn/mean": 0.7761774957180023, "rewards/reward_fn/std": 0.21673372387886047, "sampling/importance_sampling_ratio/max": 2.0050246715545654, "sampling/importance_sampling_ratio/mean": 0.5797117352485657, "sampling/importance_sampling_ratio/min": 0.00038206481258384883, "sampling/sampling_logp_difference/max": 1.3536514639854431, "sampling/sampling_logp_difference/mean": 0.004434501985087991, "step": 4560, "step_time": 6.953369270265102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2604166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2448.3333333333335, "completions/mean_length": 1310.03125, "completions/mean_terminated_length": 726.3672688802084, "completions/min_length": 152.66666666666666, "completions/min_terminated_length": 152.66666666666666, "entropy": 0.024047151766717435, "epoch": 0.5492788461538461, "frac_reward_zero_std": 0.0, "grad_norm": 0.028129171580076218, "learning_rate": 4.508413461538461e-07, "loss": -0.0072, "num_tokens": 97448342.0, "reward": 0.7264581918716431, "reward_std": 0.22521240015824637, "rewards/reward_fn/mean": 0.7264581918716431, "rewards/reward_fn/std": 0.22521240015824637, "sampling/importance_sampling_ratio/max": 1.0154427687327068, "sampling/importance_sampling_ratio/mean": 0.31178271273771924, "sampling/importance_sampling_ratio/min": 4.8360206430212806e-05, "sampling/sampling_logp_difference/max": 2.6293293635050454, "sampling/sampling_logp_difference/mean": 0.005382250839223464, "step": 4570, "step_time": 10.3879783350043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2588.0, "completions/mean_length": 1077.59375, "completions/mean_terminated_length": 587.8168029785156, "completions/min_length": 95.5, "completions/min_terminated_length": 95.5, "entropy": 0.020426085591316222, "epoch": 0.5504807692307693, "frac_reward_zero_std": 0.125, "grad_norm": 0.009132100269198418, "learning_rate": 4.4963942307692305e-07, "loss": -0.005, "num_tokens": 97572764.0, "reward": 0.5928994864225388, "reward_std": 0.31692657619714737, "rewards/reward_fn/mean": 0.5928994864225388, "rewards/reward_fn/std": 0.3169265612959862, "sampling/importance_sampling_ratio/max": 1.2793212532997131, "sampling/importance_sampling_ratio/mean": 0.41821205615997314, "sampling/importance_sampling_ratio/min": 1.245721659870469e-05, "sampling/sampling_logp_difference/max": 7.797086954116821, "sampling/sampling_logp_difference/mean": 0.004426859202794731, "step": 4580, "step_time": 7.057648047618568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2916666666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2412.3333333333335, "completions/mean_length": 1266.2708333333333, "completions/mean_terminated_length": 556.1277770996094, "completions/min_length": 180.33333333333334, "completions/min_terminated_length": 180.33333333333334, "entropy": 0.02362676952034235, "epoch": 0.5516826923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.011500795371830463, "learning_rate": 4.484375e-07, "loss": 0.0003, "num_tokens": 97813806.0, "reward": 0.7097757856051127, "reward_std": 0.25495514770348865, "rewards/reward_fn/mean": 0.7097757856051127, "rewards/reward_fn/std": 0.25495515763759613, "sampling/importance_sampling_ratio/max": 1.7101809581120808, "sampling/importance_sampling_ratio/mean": 0.350583756963412, "sampling/importance_sampling_ratio/min": 0.00020626901095965877, "sampling/sampling_logp_difference/max": 2.6492404540379844, "sampling/sampling_logp_difference/mean": 0.0047610037339230376, "step": 4590, "step_time": 10.421243258938194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 703.1875, "completions/mean_terminated_length": 330.4630432128906, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.021147100441157817, "epoch": 0.5528846153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.028241213411092758, "learning_rate": 4.472355769230769e-07, "loss": -0.0004, "num_tokens": 97924042.0, "reward": 0.803556352853775, "reward_std": 0.2279963344335556, "rewards/reward_fn/mean": 0.803556352853775, "rewards/reward_fn/std": 0.2279963195323944, "sampling/importance_sampling_ratio/max": 1.5661540031433105, "sampling/importance_sampling_ratio/mean": 0.5582396686077118, "sampling/importance_sampling_ratio/min": 0.0001103824470192194, "sampling/sampling_logp_difference/max": 2.4466747045516968, "sampling/sampling_logp_difference/mean": 0.004647887544706464, "step": 4600, "step_time": 6.72941275537014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23958333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1514.3333333333333, "completions/mean_length": 1058.1354166666667, "completions/mean_terminated_length": 457.3768717447917, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.023555009812116622, "epoch": 0.5540865384615384, "frac_reward_zero_std": 0.0, "grad_norm": 0.0022173086181282997, "learning_rate": 4.4603365384615383e-07, "loss": -0.0106, "num_tokens": 98130239.0, "reward": 0.7624485890070597, "reward_std": 0.24044211208820343, "rewards/reward_fn/mean": 0.7624485890070597, "rewards/reward_fn/std": 0.2404421071211497, "sampling/importance_sampling_ratio/max": 1.4324966271718342, "sampling/importance_sampling_ratio/mean": 0.40232348442077637, "sampling/importance_sampling_ratio/min": 6.08980541200547e-05, "sampling/sampling_logp_difference/max": 5.1547798315684, "sampling/sampling_logp_difference/mean": 0.005013307323679328, "step": 4610, "step_time": 10.222906653769314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1491.0, "completions/mean_length": 649.21875, "completions/mean_terminated_length": 362.2536926269531, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.021703909896314145, "epoch": 0.5552884615384616, "frac_reward_zero_std": 0.125, "grad_norm": 0.04720210283994675, "learning_rate": 4.4483173076923077e-07, "loss": -0.0056, "num_tokens": 98237149.0, "reward": 0.7623602449893951, "reward_std": 0.2878428250551224, "rewards/reward_fn/mean": 0.7623602449893951, "rewards/reward_fn/std": 0.2878428250551224, "sampling/importance_sampling_ratio/max": 1.5286403894424438, "sampling/importance_sampling_ratio/mean": 0.5899020731449127, "sampling/importance_sampling_ratio/min": 2.0297256412504794e-05, "sampling/sampling_logp_difference/max": 2.3600791692733765, "sampling/sampling_logp_difference/mean": 0.005028430838137865, "step": 4620, "step_time": 6.740511745028198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.22916666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2626.3333333333335, "completions/mean_length": 1171.2916666666667, "completions/mean_terminated_length": 637.6227722167969, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.02422990817576647, "epoch": 0.5564903846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.005085106939077377, "learning_rate": 4.4362980769230766e-07, "loss": -0.0035, "num_tokens": 98465257.0, "reward": 0.7481837073961893, "reward_std": 0.25569955507914227, "rewards/reward_fn/mean": 0.7481837073961893, "rewards/reward_fn/std": 0.2556995501120885, "sampling/importance_sampling_ratio/max": 2.1638811826705933, "sampling/importance_sampling_ratio/mean": 0.3727966944376628, "sampling/importance_sampling_ratio/min": 0.00011558301290885235, "sampling/sampling_logp_difference/max": 2.5198943614959717, "sampling/sampling_logp_difference/mean": 0.005379065250356992, "step": 4630, "step_time": 10.436814456246793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2375.5, "completions/mean_length": 999.484375, "completions/mean_terminated_length": 538.9710998535156, "completions/min_length": 119.5, "completions/min_terminated_length": 119.5, "entropy": 0.027212028205394746, "epoch": 0.5576923076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.02135961316525936, "learning_rate": 4.424278846153846e-07, "loss": -0.0063, "num_tokens": 98618640.0, "reward": 0.7678124010562897, "reward_std": 0.2172674536705017, "rewards/reward_fn/mean": 0.7678124010562897, "rewards/reward_fn/std": 0.21726743876934052, "sampling/importance_sampling_ratio/max": 1.474372684955597, "sampling/importance_sampling_ratio/mean": 0.39007413387298584, "sampling/importance_sampling_ratio/min": 2.7292251616017893e-05, "sampling/sampling_logp_difference/max": 1.578157126903534, "sampling/sampling_logp_difference/mean": 0.005827636690810323, "step": 4640, "step_time": 7.082813928741961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 1412.3333333333333, "completions/mean_length": 902.9375, "completions/mean_terminated_length": 496.3073018391927, "completions/min_length": 195.33333333333334, "completions/min_terminated_length": 195.33333333333334, "entropy": 0.02491483520716429, "epoch": 0.5588942307692307, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.008433706127107143, "learning_rate": 4.4122596153846155e-07, "loss": -0.0038, "num_tokens": 98807234.0, "reward": 0.8014280398686727, "reward_std": 0.20420246322949728, "rewards/reward_fn/mean": 0.8014280398686727, "rewards/reward_fn/std": 0.2042024532953898, "sampling/importance_sampling_ratio/max": 1.695973793665568, "sampling/importance_sampling_ratio/mean": 0.37267933289210003, "sampling/importance_sampling_ratio/min": 3.326495334476931e-05, "sampling/sampling_logp_difference/max": 1.5972707271575928, "sampling/sampling_logp_difference/mean": 0.005354243330657482, "step": 4650, "step_time": 9.706643173657358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2478.0, "completions/mean_length": 1656.828125, "completions/mean_terminated_length": 575.8251953125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.02190907336771488, "epoch": 0.5600961538461539, "frac_reward_zero_std": 0.125, "grad_norm": 0.0012037811102345586, "learning_rate": 4.4002403846153843e-07, "loss": 0.0002, "num_tokens": 98989983.0, "reward": 0.5517865717411041, "reward_std": 0.3251041769981384, "rewards/reward_fn/mean": 0.5517865717411041, "rewards/reward_fn/std": 0.3251041769981384, "sampling/importance_sampling_ratio/max": 1.0486707091331482, "sampling/importance_sampling_ratio/mean": 0.2713169679045677, "sampling/importance_sampling_ratio/min": 8.956128654702411e-06, "sampling/sampling_logp_difference/max": 3.4594154953956604, "sampling/sampling_logp_difference/mean": 0.004618691746145487, "step": 4660, "step_time": 6.885081348475069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23958333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1526.0, "completions/mean_length": 1083.5, "completions/mean_terminated_length": 509.586181640625, "completions/min_length": 167.66666666666666, "completions/min_terminated_length": 167.66666666666666, "entropy": 0.02258847001940012, "epoch": 0.5612980769230769, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.021574240177869797, "learning_rate": 4.388221153846154e-07, "loss": -0.0014, "num_tokens": 99194703.0, "reward": 0.7579259276390076, "reward_std": 0.2322123795747757, "rewards/reward_fn/mean": 0.7579259276390076, "rewards/reward_fn/std": 0.2322123795747757, "sampling/importance_sampling_ratio/max": 1.2181958357493083, "sampling/importance_sampling_ratio/mean": 0.35997020204861957, "sampling/importance_sampling_ratio/min": 0.0003001053058445298, "sampling/sampling_logp_difference/max": 1.511852463086446, "sampling/sampling_logp_difference/mean": 0.004768910196920236, "step": 4670, "step_time": 10.214167202915997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2573.5, "completions/mean_length": 1860.546875, "completions/mean_terminated_length": 925.3774719238281, "completions/min_length": 151.5, "completions/min_terminated_length": 151.5, "entropy": 0.01761984657496214, "epoch": 0.5625, "frac_reward_zero_std": 0.0, "grad_norm": 0.006100587081164122, "learning_rate": 4.376201923076923e-07, "loss": -0.0015, "num_tokens": 99392786.0, "reward": 0.4349208027124405, "reward_std": 0.3683205842971802, "rewards/reward_fn/mean": 0.4349208027124405, "rewards/reward_fn/std": 0.3683205842971802, "sampling/importance_sampling_ratio/max": 1.0922927856445312, "sampling/importance_sampling_ratio/mean": 0.25760699808597565, "sampling/importance_sampling_ratio/min": 7.944936987769324e-05, "sampling/sampling_logp_difference/max": 2.963417887687683, "sampling/sampling_logp_difference/mean": 0.004144983598962426, "step": 4680, "step_time": 7.196709128003567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13541666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2074.3333333333335, "completions/mean_length": 949.5520833333334, "completions/mean_terminated_length": 613.1397196451823, "completions/min_length": 122.33333333333333, "completions/min_terminated_length": 122.33333333333333, "entropy": 0.021866787504404784, "epoch": 0.5637019230769231, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.008067885413765907, "learning_rate": 4.364182692307692e-07, "loss": -0.0055, "num_tokens": 99584439.0, "reward": 0.7140432993570963, "reward_std": 0.26368758579095203, "rewards/reward_fn/mean": 0.7140432993570963, "rewards/reward_fn/std": 0.26368758579095203, "sampling/importance_sampling_ratio/max": 1.7174349625905354, "sampling/importance_sampling_ratio/mean": 0.4797653357187907, "sampling/importance_sampling_ratio/min": 5.26220131481144e-05, "sampling/sampling_logp_difference/max": 2.489067316055298, "sampling/sampling_logp_difference/mean": 0.0051809440677364664, "step": 4690, "step_time": 9.973191524948925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2171.0, "completions/max_terminated_length": 1323.0, "completions/mean_length": 855.6875, "completions/mean_terminated_length": 364.4093780517578, "completions/min_length": 141.5, "completions/min_terminated_length": 141.5, "entropy": 0.022597620636224745, "epoch": 0.5649038461538461, "frac_reward_zero_std": 0.0, "grad_norm": 0.033470235764980316, "learning_rate": 4.352163461538461e-07, "loss": -0.0282, "num_tokens": 99709299.0, "reward": 0.778680145740509, "reward_std": 0.21637780964374542, "rewards/reward_fn/mean": 0.778680145740509, "rewards/reward_fn/std": 0.21637779474258423, "sampling/importance_sampling_ratio/max": 1.2707395553588867, "sampling/importance_sampling_ratio/mean": 0.5148799270391464, "sampling/importance_sampling_ratio/min": 0.003217014695110265, "sampling/sampling_logp_difference/max": 1.7364155054092407, "sampling/sampling_logp_difference/mean": 0.005471408134326339, "step": 4700, "step_time": 5.0825703330338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.22916666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 1838.6666666666667, "completions/mean_length": 1074.0833333333333, "completions/mean_terminated_length": 499.4611104329427, "completions/min_length": 153.33333333333334, "completions/min_terminated_length": 153.33333333333334, "entropy": 0.02644693087786436, "epoch": 0.5661057692307693, "frac_reward_zero_std": 0.0, "grad_norm": 0.006397308316081762, "learning_rate": 4.3401442307692304e-07, "loss": -0.0037, "num_tokens": 99915395.0, "reward": 0.7415772279103597, "reward_std": 0.24836899836858115, "rewards/reward_fn/mean": 0.7415772279103597, "rewards/reward_fn/std": 0.24836898346741995, "sampling/importance_sampling_ratio/max": 1.512966513633728, "sampling/importance_sampling_ratio/mean": 0.3117244839668274, "sampling/importance_sampling_ratio/min": 7.94431868295457e-05, "sampling/sampling_logp_difference/max": 1.8071233034133911, "sampling/sampling_logp_difference/mean": 0.005824047451217969, "step": 4710, "step_time": 10.386933123134076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2853.0, "completions/mean_length": 1356.03125, "completions/mean_terminated_length": 881.0750122070312, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.025592724233865737, "epoch": 0.5673076923076923, "frac_reward_zero_std": 0.125, "grad_norm": 0.021197061985731125, "learning_rate": 4.328125e-07, "loss": -0.0023, "num_tokens": 100080149.0, "reward": 0.6454020738601685, "reward_std": 0.33575864136219025, "rewards/reward_fn/mean": 0.6454020738601685, "rewards/reward_fn/std": 0.33575865626335144, "sampling/importance_sampling_ratio/max": 0.8036024421453476, "sampling/importance_sampling_ratio/mean": 0.24692358821630478, "sampling/importance_sampling_ratio/min": 5.680552021658514e-06, "sampling/sampling_logp_difference/max": 1.5822123289108276, "sampling/sampling_logp_difference/mean": 0.005205315304920077, "step": 4720, "step_time": 7.0919765719212595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23958333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2243.6666666666665, "completions/mean_length": 1158.6875, "completions/mean_terminated_length": 584.9291788736979, "completions/min_length": 147.66666666666666, "completions/min_terminated_length": 147.66666666666666, "entropy": 0.024007597006857395, "epoch": 0.5685096153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.00816371664404869, "learning_rate": 4.316105769230769e-07, "loss": 0.0062, "num_tokens": 100309711.0, "reward": 0.7710356116294861, "reward_std": 0.22740711271762848, "rewards/reward_fn/mean": 0.7710356116294861, "rewards/reward_fn/std": 0.22740711271762848, "sampling/importance_sampling_ratio/max": 1.2891841729482014, "sampling/importance_sampling_ratio/mean": 0.32138242324193317, "sampling/importance_sampling_ratio/min": 0.0008709802829495553, "sampling/sampling_logp_difference/max": 1.7392321825027466, "sampling/sampling_logp_difference/mean": 0.0055523936947186785, "step": 4730, "step_time": 10.097063265461475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1803.0, "completions/mean_length": 792.0625, "completions/mean_terminated_length": 427.09129333496094, "completions/min_length": 131.5, "completions/min_terminated_length": 131.5, "entropy": 0.02493090704083443, "epoch": 0.5697115384615384, "frac_reward_zero_std": 0.125, "grad_norm": 0.01811068318784237, "learning_rate": 4.304086538461538e-07, "loss": -0.0057, "num_tokens": 100434731.0, "reward": 0.7794182300567627, "reward_std": 0.24762509763240814, "rewards/reward_fn/mean": 0.7794182300567627, "rewards/reward_fn/std": 0.24762509763240814, "sampling/importance_sampling_ratio/max": 2.31171452999115, "sampling/importance_sampling_ratio/mean": 0.5325972884893417, "sampling/importance_sampling_ratio/min": 0.00013172200306144077, "sampling/sampling_logp_difference/max": 1.3844528198242188, "sampling/sampling_logp_difference/mean": 0.005583083489909768, "step": 4740, "step_time": 6.940674606058747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23958333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1735.3333333333333, "completions/mean_length": 1086.9791666666667, "completions/mean_terminated_length": 476.60675048828125, "completions/min_length": 133.33333333333334, "completions/min_terminated_length": 133.33333333333334, "entropy": 0.024032506067305802, "epoch": 0.5709134615384616, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.005081892013549805, "learning_rate": 4.2920673076923076e-07, "loss": 0.0021, "num_tokens": 100656441.0, "reward": 0.7682750423749288, "reward_std": 0.20916451017061868, "rewards/reward_fn/mean": 0.7682750423749288, "rewards/reward_fn/std": 0.20916450520356497, "sampling/importance_sampling_ratio/max": 1.5220094919204712, "sampling/importance_sampling_ratio/mean": 0.3698798020680745, "sampling/importance_sampling_ratio/min": 5.069670900563021e-06, "sampling/sampling_logp_difference/max": 1.9065341154734294, "sampling/sampling_logp_difference/mean": 0.005420365060369174, "step": 4750, "step_time": 10.203029814083129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2098.0, "completions/mean_length": 1074.375, "completions/mean_terminated_length": 553.4415893554688, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.021880332566797733, "epoch": 0.5721153846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.003652968443930149, "learning_rate": 4.280048076923077e-07, "loss": 0.0059, "num_tokens": 100784177.0, "reward": 0.7405271232128143, "reward_std": 0.23148753494024277, "rewards/reward_fn/mean": 0.7405271232128143, "rewards/reward_fn/std": 0.23148751258850098, "sampling/importance_sampling_ratio/max": 1.2198252081871033, "sampling/importance_sampling_ratio/mean": 0.3918258994817734, "sampling/importance_sampling_ratio/min": 0.00021933351490588393, "sampling/sampling_logp_difference/max": 2.0627450942993164, "sampling/sampling_logp_difference/mean": 0.0046733415219932795, "step": 4760, "step_time": 7.002548469882458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1701.0, "completions/mean_length": 810.3958333333334, "completions/mean_terminated_length": 406.7034098307292, "completions/min_length": 123.33333333333333, "completions/min_terminated_length": 123.33333333333333, "entropy": 0.021209524758160115, "epoch": 0.5733173076923077, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.013312410563230515, "learning_rate": 4.268028846153846e-07, "loss": -0.0112, "num_tokens": 100963863.0, "reward": 0.8092706402142843, "reward_std": 0.1906796395778656, "rewards/reward_fn/mean": 0.8092706402142843, "rewards/reward_fn/std": 0.19067964454491934, "sampling/importance_sampling_ratio/max": 1.6612978378931682, "sampling/importance_sampling_ratio/mean": 0.5412750740845998, "sampling/importance_sampling_ratio/min": 0.0001579450294192005, "sampling/sampling_logp_difference/max": 2.0215343634287515, "sampling/sampling_logp_difference/mean": 0.0049331521925826865, "step": 4770, "step_time": 10.068143427558244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 820.890625, "completions/mean_terminated_length": 462.00132751464844, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.024867743253707886, "epoch": 0.5745192307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.01276171114295721, "learning_rate": 4.2560096153846154e-07, "loss": 0.0008, "num_tokens": 101082056.0, "reward": 0.7997393608093262, "reward_std": 0.21116505563259125, "rewards/reward_fn/mean": 0.7997393608093262, "rewards/reward_fn/std": 0.21116505563259125, "sampling/importance_sampling_ratio/max": 1.2203878164291382, "sampling/importance_sampling_ratio/mean": 0.4335853308439255, "sampling/importance_sampling_ratio/min": 0.0003147758407067158, "sampling/sampling_logp_difference/max": 2.239464521408081, "sampling/sampling_logp_difference/mean": 0.0056192693300545216, "step": 4780, "step_time": 6.716025569476187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19791666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2086.3333333333335, "completions/mean_length": 998.1041666666666, "completions/mean_terminated_length": 495.4107666015625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.021574134845286606, "epoch": 0.5757211538461539, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.0223807692527771, "learning_rate": 4.243990384615385e-07, "loss": 0.0007, "num_tokens": 101294530.0, "reward": 0.6846240560213724, "reward_std": 0.27560468514760333, "rewards/reward_fn/mean": 0.6846240560213724, "rewards/reward_fn/std": 0.27560467024644214, "sampling/importance_sampling_ratio/max": 1.6658755540847778, "sampling/importance_sampling_ratio/mean": 0.4562239646911621, "sampling/importance_sampling_ratio/min": 0.0009703209810443999, "sampling/sampling_logp_difference/max": 1.9610299666722615, "sampling/sampling_logp_difference/mean": 0.005253803605834643, "step": 4790, "step_time": 10.220233269315212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1843.0, "completions/mean_length": 1224.53125, "completions/mean_terminated_length": 547.2261199951172, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.024056249111890794, "epoch": 0.5769230769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.011955502443015575, "learning_rate": 4.2319711538461537e-07, "loss": 0.0009, "num_tokens": 101449572.0, "reward": 0.7300267517566681, "reward_std": 0.25107288360595703, "rewards/reward_fn/mean": 0.7300267517566681, "rewards/reward_fn/std": 0.25107286870479584, "sampling/importance_sampling_ratio/max": 1.4083142578601837, "sampling/importance_sampling_ratio/mean": 0.3560352623462677, "sampling/importance_sampling_ratio/min": 7.98562996351393e-06, "sampling/sampling_logp_difference/max": 2.700586259365082, "sampling/sampling_logp_difference/mean": 0.004852702375501394, "step": 4800, "step_time": 7.081853283755481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2389.3333333333335, "completions/mean_length": 961.4479166666666, "completions/mean_terminated_length": 556.6527303059896, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.027795279584825038, "epoch": 0.578125, "frac_reward_zero_std": 0.0, "grad_norm": 0.004317061509937048, "learning_rate": 4.219951923076923e-07, "loss": -0.0037, "num_tokens": 101646775.0, "reward": 0.8171068827311198, "reward_std": 0.21196740369002023, "rewards/reward_fn/mean": 0.8171068827311198, "rewards/reward_fn/std": 0.21196739872296652, "sampling/importance_sampling_ratio/max": 1.4399408102035522, "sampling/importance_sampling_ratio/mean": 0.37520233790079754, "sampling/importance_sampling_ratio/min": 0.00027061443203516927, "sampling/sampling_logp_difference/max": 1.3466697533925374, "sampling/sampling_logp_difference/mean": 0.0053937264407674474, "step": 4810, "step_time": 10.344665996450932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2590.5, "completions/mean_length": 1186.96875, "completions/mean_terminated_length": 636.3662414550781, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.02490924932062626, "epoch": 0.5793269230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.026855913922190666, "learning_rate": 4.207932692307692e-07, "loss": 0.0009, "num_tokens": 101789765.0, "reward": 0.7765503823757172, "reward_std": 0.2380327582359314, "rewards/reward_fn/mean": 0.7765503823757172, "rewards/reward_fn/std": 0.2380327731370926, "sampling/importance_sampling_ratio/max": 1.1102233529090881, "sampling/importance_sampling_ratio/mean": 0.3227231353521347, "sampling/importance_sampling_ratio/min": 0.0002473349559295457, "sampling/sampling_logp_difference/max": 2.876979112625122, "sampling/sampling_logp_difference/mean": 0.005088199395686388, "step": 4820, "step_time": 6.908480853587389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10416666666666667, "completions/max_length": 2924.3333333333335, "completions/max_terminated_length": 1906.0, "completions/mean_length": 675.09375, "completions/mean_terminated_length": 412.4629211425781, "completions/min_length": 114.66666666666667, "completions/min_terminated_length": 114.66666666666667, "entropy": 0.02265103217214346, "epoch": 0.5805288461538461, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.014652831479907036, "learning_rate": 4.195913461538461e-07, "loss": 0.0101, "num_tokens": 101952150.0, "reward": 0.7546736200650533, "reward_std": 0.2916797995567322, "rewards/reward_fn/mean": 0.7546736200650533, "rewards/reward_fn/std": 0.2916797995567322, "sampling/importance_sampling_ratio/max": 1.3853487968444824, "sampling/importance_sampling_ratio/mean": 0.5314125418663025, "sampling/importance_sampling_ratio/min": 0.003113578694562117, "sampling/sampling_logp_difference/max": 1.7362712621688843, "sampling/sampling_logp_difference/mean": 0.005054396266738574, "step": 4830, "step_time": 9.6338730905205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2141.0, "completions/mean_length": 709.015625, "completions/mean_terminated_length": 381.73216247558594, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.020984235033392907, "epoch": 0.5817307692307693, "frac_reward_zero_std": 0.125, "grad_norm": 0.030850177630782127, "learning_rate": 4.1838942307692303e-07, "loss": -0.0053, "num_tokens": 102070311.0, "reward": 0.7096692323684692, "reward_std": 0.3068932667374611, "rewards/reward_fn/mean": 0.7096692323684692, "rewards/reward_fn/std": 0.3068932592868805, "sampling/importance_sampling_ratio/max": 1.803731381893158, "sampling/importance_sampling_ratio/mean": 0.5935621857643127, "sampling/importance_sampling_ratio/min": 4.069026954311994e-05, "sampling/sampling_logp_difference/max": 3.7934130430221558, "sampling/sampling_logp_difference/mean": 0.004612303571775556, "step": 4840, "step_time": 6.868545278999955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2604166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2403.6666666666665, "completions/mean_length": 1221.21875, "completions/mean_terminated_length": 615.2819010416666, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.022916336357593537, "epoch": 0.5829326923076923, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.001990628894418478, "learning_rate": 4.171875e-07, "loss": -0.0061, "num_tokens": 102288108.0, "reward": 0.6915499170621237, "reward_std": 0.2752617796262105, "rewards/reward_fn/mean": 0.6915499170621237, "rewards/reward_fn/std": 0.2752617696921031, "sampling/importance_sampling_ratio/max": 1.4303771654764812, "sampling/importance_sampling_ratio/mean": 0.3527977168560028, "sampling/importance_sampling_ratio/min": 9.482967349564812e-05, "sampling/sampling_logp_difference/max": 5.12975811958313, "sampling/sampling_logp_difference/mean": 0.004840802401304245, "step": 4850, "step_time": 10.233467449434102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2655.5, "completions/mean_length": 896.984375, "completions/mean_terminated_length": 461.02565002441406, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.019719032011926173, "epoch": 0.5841346153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.02250264212489128, "learning_rate": 4.159855769230769e-07, "loss": 0.0034, "num_tokens": 102424019.0, "reward": 0.6517940759658813, "reward_std": 0.3442230373620987, "rewards/reward_fn/mean": 0.6517940759658813, "rewards/reward_fn/std": 0.3442230373620987, "sampling/importance_sampling_ratio/max": 1.7070155143737793, "sampling/importance_sampling_ratio/mean": 0.4852282404899597, "sampling/importance_sampling_ratio/min": 1.5565907688142033e-05, "sampling/sampling_logp_difference/max": 1.4993638396263123, "sampling/sampling_logp_difference/mean": 0.004475282970815897, "step": 4860, "step_time": 6.937070699967444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5416666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2437.6666666666665, "completions/mean_length": 1913.9375, "completions/mean_terminated_length": 630.1375935872396, "completions/min_length": 154.33333333333334, "completions/min_terminated_length": 154.33333333333334, "entropy": 0.019445449486374854, "epoch": 0.5853365384615384, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.006359618157148361, "learning_rate": 4.147836538461538e-07, "loss": -0.001, "num_tokens": 102729373.0, "reward": 0.5586279531319936, "reward_std": 0.2819663683573405, "rewards/reward_fn/mean": 0.5586279531319936, "rewards/reward_fn/std": 0.2819663683573405, "sampling/importance_sampling_ratio/max": 1.4365261991818745, "sampling/importance_sampling_ratio/mean": 0.26463093360265094, "sampling/importance_sampling_ratio/min": 4.952289327775361e-05, "sampling/sampling_logp_difference/max": 2.369912584622701, "sampling/sampling_logp_difference/mean": 0.0041012804334362345, "step": 4870, "step_time": 10.526767113059758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2260.5, "completions/mean_length": 779.65625, "completions/mean_terminated_length": 369.9684143066406, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.019110479019582273, "epoch": 0.5865384615384616, "frac_reward_zero_std": 0.125, "grad_norm": 0.027921130880713463, "learning_rate": 4.1358173076923075e-07, "loss": -0.011, "num_tokens": 102838343.0, "reward": 0.8026587069034576, "reward_std": 0.21886324137449265, "rewards/reward_fn/mean": 0.8026587069034576, "rewards/reward_fn/std": 0.21886324137449265, "sampling/importance_sampling_ratio/max": 1.5789586901664734, "sampling/importance_sampling_ratio/mean": 0.577175498008728, "sampling/importance_sampling_ratio/min": 6.49341382086277e-05, "sampling/sampling_logp_difference/max": 1.6902188658714294, "sampling/sampling_logp_difference/mean": 0.004087470006197691, "step": 4880, "step_time": 6.893620060477406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2604166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1750.3333333333333, "completions/mean_length": 1160.6979166666667, "completions/mean_terminated_length": 499.14039103190106, "completions/min_length": 141.33333333333334, "completions/min_terminated_length": 141.33333333333334, "entropy": 0.023764509707689285, "epoch": 0.5877403846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.006115597672760487, "learning_rate": 4.123798076923077e-07, "loss": 0.0035, "num_tokens": 103062186.0, "reward": 0.7276701132456461, "reward_std": 0.26079527537027997, "rewards/reward_fn/mean": 0.7276701132456461, "rewards/reward_fn/std": 0.2607952704032262, "sampling/importance_sampling_ratio/max": 1.458106239636739, "sampling/importance_sampling_ratio/mean": 0.38128988941510517, "sampling/importance_sampling_ratio/min": 0.0002890738995802167, "sampling/sampling_logp_difference/max": 5.122668902079265, "sampling/sampling_logp_difference/mean": 0.004752009486158689, "step": 4890, "step_time": 10.270922887604684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1802.0, "completions/mean_length": 924.515625, "completions/mean_terminated_length": 453.84092712402344, "completions/min_length": 90.5, "completions/min_terminated_length": 90.5, "entropy": 0.0211644833907485, "epoch": 0.5889423076923077, "frac_reward_zero_std": 0.125, "grad_norm": 0.02123301289975643, "learning_rate": 4.111778846153846e-07, "loss": -0.0064, "num_tokens": 103186195.0, "reward": 0.5620559751987457, "reward_std": 0.3786896616220474, "rewards/reward_fn/mean": 0.5620559751987457, "rewards/reward_fn/std": 0.37868964672088623, "sampling/importance_sampling_ratio/max": 1.2570249140262604, "sampling/importance_sampling_ratio/mean": 0.4666053056716919, "sampling/importance_sampling_ratio/min": 0.00016854002751642838, "sampling/sampling_logp_difference/max": 1.6039137244224548, "sampling/sampling_logp_difference/mean": 0.004408363485708833, "step": 4900, "step_time": 6.8224838458001615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20833333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2210.6666666666665, "completions/mean_length": 1034.625, "completions/mean_terminated_length": 509.3207702636719, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.022298750653862952, "epoch": 0.5901442307692307, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.012239735573530197, "learning_rate": 4.0997596153846153e-07, "loss": 0.0192, "num_tokens": 103406663.0, "reward": 0.6743758320808411, "reward_std": 0.3195442060629527, "rewards/reward_fn/mean": 0.6743758320808411, "rewards/reward_fn/std": 0.3195441961288452, "sampling/importance_sampling_ratio/max": 1.5196356773376465, "sampling/importance_sampling_ratio/mean": 0.4678070346514384, "sampling/importance_sampling_ratio/min": 0.00011239923454316643, "sampling/sampling_logp_difference/max": 2.1601450443267822, "sampling/sampling_logp_difference/mean": 0.004525871792187293, "step": 4910, "step_time": 10.208277633413672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2625.0, "completions/mean_length": 1131.53125, "completions/mean_terminated_length": 612.2452087402344, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.02164719272404909, "epoch": 0.5913461538461539, "frac_reward_zero_std": 0.25, "grad_norm": 0.0027049037162214518, "learning_rate": 4.0877403846153847e-07, "loss": -0.0072, "num_tokens": 103560169.0, "reward": 0.6572767794132233, "reward_std": 0.2858353778719902, "rewards/reward_fn/mean": 0.6572767794132233, "rewards/reward_fn/std": 0.2858353704214096, "sampling/importance_sampling_ratio/max": 1.8628088235855103, "sampling/importance_sampling_ratio/mean": 0.39583373069763184, "sampling/importance_sampling_ratio/min": 1.696332219580654e-06, "sampling/sampling_logp_difference/max": 2.9159887433052063, "sampling/sampling_logp_difference/mean": 0.0048941136337816715, "step": 4920, "step_time": 7.142695978935808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.17708333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1924.0, "completions/mean_length": 937.9791666666666, "completions/mean_terminated_length": 494.20653279622394, "completions/min_length": 134.33333333333334, "completions/min_terminated_length": 134.33333333333334, "entropy": 0.02831564601510763, "epoch": 0.5925480769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.012872512452304363, "learning_rate": 4.075721153846154e-07, "loss": -0.013, "num_tokens": 103757191.0, "reward": 0.7709330519040426, "reward_std": 0.25404555598894757, "rewards/reward_fn/mean": 0.7709330519040426, "rewards/reward_fn/std": 0.2540455460548401, "sampling/importance_sampling_ratio/max": 1.3768915732701619, "sampling/importance_sampling_ratio/mean": 0.38867974281311035, "sampling/importance_sampling_ratio/min": 0.00038489247526740655, "sampling/sampling_logp_difference/max": 2.479659159978231, "sampling/sampling_logp_difference/mean": 0.005906573496758938, "step": 4930, "step_time": 9.991337131988256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1815.5, "completions/mean_length": 685.34375, "completions/mean_terminated_length": 310.7799987792969, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.02254042010754347, "epoch": 0.59375, "frac_reward_zero_std": 0.125, "grad_norm": 0.012193321250379086, "learning_rate": 4.063701923076923e-07, "loss": -0.0091, "num_tokens": 103869317.0, "reward": 0.7115818560123444, "reward_std": 0.31530239433050156, "rewards/reward_fn/mean": 0.7115818560123444, "rewards/reward_fn/std": 0.31530238687992096, "sampling/importance_sampling_ratio/max": 1.3543621599674225, "sampling/importance_sampling_ratio/mean": 0.6017886698246002, "sampling/importance_sampling_ratio/min": 0.0007288661145139486, "sampling/sampling_logp_difference/max": 1.594328761100769, "sampling/sampling_logp_difference/mean": 0.004249157500453293, "step": 4940, "step_time": 6.707702626101673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 1810.3333333333333, "completions/mean_length": 1130.7291666666667, "completions/mean_terminated_length": 509.14247639973956, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.02684724573045969, "epoch": 0.5949519230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.01685972884297371, "learning_rate": 4.051682692307692e-07, "loss": -0.012, "num_tokens": 104091939.0, "reward": 0.7657289107640585, "reward_std": 0.2388941248257955, "rewards/reward_fn/mean": 0.7657289107640585, "rewards/reward_fn/std": 0.2388941099246343, "sampling/importance_sampling_ratio/max": 1.4688480099042256, "sampling/importance_sampling_ratio/mean": 0.3432903438806534, "sampling/importance_sampling_ratio/min": 0.0008778878261762138, "sampling/sampling_logp_difference/max": 1.8842936356862385, "sampling/sampling_logp_difference/mean": 0.0058411406353116035, "step": 4950, "step_time": 10.400688770972192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2229.0, "completions/mean_length": 1166.796875, "completions/mean_terminated_length": 592.6809844970703, "completions/min_length": 165.5, "completions/min_terminated_length": 165.5, "entropy": 0.023632480949163436, "epoch": 0.5961538461538461, "frac_reward_zero_std": 0.125, "grad_norm": 0.009805792942643166, "learning_rate": 4.0396634615384613e-07, "loss": -0.0025, "num_tokens": 104246486.0, "reward": 0.7761298716068268, "reward_std": 0.24034328013658524, "rewards/reward_fn/mean": 0.7761298716068268, "rewards/reward_fn/std": 0.24034328013658524, "sampling/importance_sampling_ratio/max": 1.8972207903862, "sampling/importance_sampling_ratio/mean": 0.38975587487220764, "sampling/importance_sampling_ratio/min": 0.00011053400703531224, "sampling/sampling_logp_difference/max": 2.9088631868362427, "sampling/sampling_logp_difference/mean": 0.005480707623064518, "step": 4960, "step_time": 7.081577229313552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2372.6666666666665, "completions/mean_length": 1253.5104166666667, "completions/mean_terminated_length": 615.9616088867188, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.02284114295616746, "epoch": 0.5973557692307693, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.00528599601238966, "learning_rate": 4.02764423076923e-07, "loss": 0.0012, "num_tokens": 104524183.0, "reward": 0.6910169919331869, "reward_std": 0.32408665617307025, "rewards/reward_fn/mean": 0.6910169919331869, "rewards/reward_fn/std": 0.32408665617307025, "sampling/importance_sampling_ratio/max": 1.263754626115163, "sampling/importance_sampling_ratio/mean": 0.36773579319318134, "sampling/importance_sampling_ratio/min": 0.00013132059393683448, "sampling/sampling_logp_difference/max": 2.149644136428833, "sampling/sampling_logp_difference/mean": 0.005263534219314654, "step": 4970, "step_time": 10.993262417241931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1950.5, "completions/mean_length": 689.984375, "completions/mean_terminated_length": 497.2827606201172, "completions/min_length": 132.5, "completions/min_terminated_length": 132.5, "entropy": 0.027624637261033057, "epoch": 0.5985576923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.008957736194133759, "learning_rate": 4.0156249999999997e-07, "loss": -0.0027, "num_tokens": 104629238.0, "reward": 0.8578894734382629, "reward_std": 0.15532603114843369, "rewards/reward_fn/mean": 0.8578894734382629, "rewards/reward_fn/std": 0.15532603859901428, "sampling/importance_sampling_ratio/max": 1.296023666858673, "sampling/importance_sampling_ratio/mean": 0.39213769137859344, "sampling/importance_sampling_ratio/min": 4.8655803311703494e-05, "sampling/sampling_logp_difference/max": 1.6199893355369568, "sampling/sampling_logp_difference/mean": 0.006319256965070963, "step": 4980, "step_time": 6.692264613136649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1729.6666666666667, "completions/mean_length": 998.2916666666666, "completions/mean_terminated_length": 443.55556233723956, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.025915177166461946, "epoch": 0.5997596153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.0053750379011034966, "learning_rate": 4.003605769230769e-07, "loss": -0.0068, "num_tokens": 104834146.0, "reward": 0.7457575798034668, "reward_std": 0.2818837563196818, "rewards/reward_fn/mean": 0.7457575798034668, "rewards/reward_fn/std": 0.28188374141852063, "sampling/importance_sampling_ratio/max": 1.493797739346822, "sampling/importance_sampling_ratio/mean": 0.3781926433245341, "sampling/importance_sampling_ratio/min": 3.813539979091729e-05, "sampling/sampling_logp_difference/max": 2.3447208404541016, "sampling/sampling_logp_difference/mean": 0.00594593600059549, "step": 4990, "step_time": 9.954669601749629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1175.5, "completions/mean_length": 634.015625, "completions/mean_terminated_length": 294.4833984375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.02125542489811778, "epoch": 0.6009615384615384, "frac_reward_zero_std": 0.25, "grad_norm": 0.02505166083574295, "learning_rate": 3.991586538461538e-07, "loss": -0.0036, "num_tokens": 104950739.0, "reward": 0.6197971701622009, "reward_std": 0.30241070687770844, "rewards/reward_fn/mean": 0.6197971701622009, "rewards/reward_fn/std": 0.30241069197654724, "sampling/importance_sampling_ratio/max": 1.2878140211105347, "sampling/importance_sampling_ratio/mean": 0.5813890397548676, "sampling/importance_sampling_ratio/min": 0.002387404936598614, "sampling/sampling_logp_difference/max": 2.017430543899536, "sampling/sampling_logp_difference/mean": 0.004083445528522134, "step": 5000, "step_time": 6.829733473807574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2046.6666666666667, "completions/mean_length": 995.7291666666666, "completions/mean_terminated_length": 527.3403727213541, "completions/min_length": 147.33333333333334, "completions/min_terminated_length": 147.33333333333334, "entropy": 0.022522216476500035, "epoch": 0.6021634615384616, "frac_reward_zero_std": 0.0, "grad_norm": 0.012101664207875729, "learning_rate": 3.9795673076923074e-07, "loss": -0.0117, "num_tokens": 105175865.0, "reward": 0.7437825997670492, "reward_std": 0.2951398392518361, "rewards/reward_fn/mean": 0.7437825997670492, "rewards/reward_fn/std": 0.2951398491859436, "sampling/importance_sampling_ratio/max": 1.7360821564992268, "sampling/importance_sampling_ratio/mean": 0.46573416392008465, "sampling/importance_sampling_ratio/min": 1.866499965785806e-05, "sampling/sampling_logp_difference/max": 5.767338037490845, "sampling/sampling_logp_difference/mean": 0.00572083347166578, "step": 5010, "step_time": 10.194596168305726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2110.0, "completions/mean_length": 969.015625, "completions/mean_terminated_length": 550.8650054931641, "completions/min_length": 148.5, "completions/min_terminated_length": 148.5, "entropy": 0.02747969385236502, "epoch": 0.6033653846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.018577666953206062, "learning_rate": 3.967548076923077e-07, "loss": 0.0129, "num_tokens": 105310682.0, "reward": 0.8321851491928101, "reward_std": 0.1905403956770897, "rewards/reward_fn/mean": 0.8321851491928101, "rewards/reward_fn/std": 0.1905403956770897, "sampling/importance_sampling_ratio/max": 1.3066293597221375, "sampling/importance_sampling_ratio/mean": 0.3421141654253006, "sampling/importance_sampling_ratio/min": 0.00032554548079133383, "sampling/sampling_logp_difference/max": 1.8156324625015259, "sampling/sampling_logp_difference/mean": 0.00650100689381361, "step": 5020, "step_time": 6.969020915217698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3854166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2275.3333333333335, "completions/mean_length": 1605.9375, "completions/mean_terminated_length": 729.3140258789062, "completions/min_length": 159.33333333333334, "completions/min_terminated_length": 159.33333333333334, "entropy": 0.022434124536812306, "epoch": 0.6045673076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.004263255745172501, "learning_rate": 3.9555288461538463e-07, "loss": -0.0075, "num_tokens": 105583892.0, "reward": 0.6616113583246866, "reward_std": 0.27677159508069354, "rewards/reward_fn/mean": 0.6616113583246866, "rewards/reward_fn/std": 0.276771605014801, "sampling/importance_sampling_ratio/max": 1.6937284866968791, "sampling/importance_sampling_ratio/mean": 0.28058283527692157, "sampling/importance_sampling_ratio/min": 5.2624907008672985e-05, "sampling/sampling_logp_difference/max": 3.35296360651652, "sampling/sampling_logp_difference/mean": 0.004805552773177624, "step": 5030, "step_time": 10.343037638347596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1903.0, "completions/mean_length": 1443.859375, "completions/mean_terminated_length": 519.5379028320312, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.024822285026311876, "epoch": 0.6057692307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.013118373230099678, "learning_rate": 3.943509615384615e-07, "loss": -0.0019, "num_tokens": 105758907.0, "reward": 0.7158427238464355, "reward_std": 0.23397590219974518, "rewards/reward_fn/mean": 0.7158427238464355, "rewards/reward_fn/std": 0.23397590965032578, "sampling/importance_sampling_ratio/max": 1.0760427713394165, "sampling/importance_sampling_ratio/mean": 0.2928331717848778, "sampling/importance_sampling_ratio/min": 9.827712960941426e-05, "sampling/sampling_logp_difference/max": 4.232422590255737, "sampling/sampling_logp_difference/mean": 0.005143709946423769, "step": 5040, "step_time": 7.230512062832713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.22916666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 1926.6666666666667, "completions/mean_length": 1103.6145833333333, "completions/mean_terminated_length": 552.3997802734375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.026300636120140553, "epoch": 0.6069711538461539, "frac_reward_zero_std": 0.0, "grad_norm": 0.03008558414876461, "learning_rate": 3.9314903846153846e-07, "loss": 0.0169, "num_tokens": 105967366.0, "reward": 0.7765279213587443, "reward_std": 0.23014473418394724, "rewards/reward_fn/mean": 0.7765279213587443, "rewards/reward_fn/std": 0.23014472424983978, "sampling/importance_sampling_ratio/max": 1.470889409383138, "sampling/importance_sampling_ratio/mean": 0.41898902753988904, "sampling/importance_sampling_ratio/min": 0.00013191444190852053, "sampling/sampling_logp_difference/max": 2.136867046356201, "sampling/sampling_logp_difference/mean": 0.005335469419757525, "step": 5050, "step_time": 10.298484700545668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1965.0, "completions/max_terminated_length": 1292.5, "completions/mean_length": 807.859375, "completions/mean_terminated_length": 427.72442626953125, "completions/min_length": 177.5, "completions/min_terminated_length": 177.5, "entropy": 0.020014599338173865, "epoch": 0.6081730769230769, "frac_reward_zero_std": 0.25, "grad_norm": 0.031883031129837036, "learning_rate": 3.919471153846154e-07, "loss": 0.0096, "num_tokens": 106083717.0, "reward": 0.5458946079015732, "reward_std": 0.3811759501695633, "rewards/reward_fn/mean": 0.5458946079015732, "rewards/reward_fn/std": 0.3811759501695633, "sampling/importance_sampling_ratio/max": 1.2363301515579224, "sampling/importance_sampling_ratio/mean": 0.4568900167942047, "sampling/importance_sampling_ratio/min": 0.0010497698955873602, "sampling/sampling_logp_difference/max": 3.8965442180633545, "sampling/sampling_logp_difference/mean": 0.005163348279893398, "step": 5060, "step_time": 4.623298030439765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1886.3333333333333, "completions/mean_length": 1128.7291666666667, "completions/mean_terminated_length": 408.2699381510417, "completions/min_length": 106.33333333333333, "completions/min_terminated_length": 106.33333333333333, "entropy": 0.026266050525009632, "epoch": 0.609375, "frac_reward_zero_std": 0.0, "grad_norm": 0.008631826378405094, "learning_rate": 3.907451923076923e-07, "loss": -0.0085, "num_tokens": 106305347.0, "reward": 0.7632948358853658, "reward_std": 0.21390278140703836, "rewards/reward_fn/mean": 0.7632948358853658, "rewards/reward_fn/std": 0.21390277643998465, "sampling/importance_sampling_ratio/max": 1.4759195645650227, "sampling/importance_sampling_ratio/mean": 0.4003698031107585, "sampling/importance_sampling_ratio/min": 7.827130281630919e-05, "sampling/sampling_logp_difference/max": 3.2449154059092202, "sampling/sampling_logp_difference/mean": 0.005691803681353728, "step": 5070, "step_time": 10.280946888308971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2090.5, "completions/mean_length": 1074.734375, "completions/mean_terminated_length": 471.54168701171875, "completions/min_length": 106.5, "completions/min_terminated_length": 106.5, "entropy": 0.026087970845401286, "epoch": 0.6105769230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.014748647809028625, "learning_rate": 3.895432692307692e-07, "loss": -0.0011, "num_tokens": 106450754.0, "reward": 0.7549424469470978, "reward_std": 0.24799763411283493, "rewards/reward_fn/mean": 0.7549424469470978, "rewards/reward_fn/std": 0.24799762666225433, "sampling/importance_sampling_ratio/max": 1.0301072299480438, "sampling/importance_sampling_ratio/mean": 0.30902518332004547, "sampling/importance_sampling_ratio/min": 3.811893236616015e-07, "sampling/sampling_logp_difference/max": 3.819211006164551, "sampling/sampling_logp_difference/mean": 0.006213832646608353, "step": 5080, "step_time": 6.87015322977677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1402.3333333333333, "completions/mean_length": 932.8333333333334, "completions/mean_terminated_length": 351.68878173828125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.023189817648380996, "epoch": 0.6117788461538461, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.015997102484107018, "learning_rate": 3.883413461538461e-07, "loss": 0.0032, "num_tokens": 106665690.0, "reward": 0.7057939966519674, "reward_std": 0.2519174963235855, "rewards/reward_fn/mean": 0.7057939966519674, "rewards/reward_fn/std": 0.2519174963235855, "sampling/importance_sampling_ratio/max": 1.2030082146326702, "sampling/importance_sampling_ratio/mean": 0.4721429447333018, "sampling/importance_sampling_ratio/min": 9.450651541579684e-05, "sampling/sampling_logp_difference/max": 2.03209125995636, "sampling/sampling_logp_difference/mean": 0.005240684452777107, "step": 5090, "step_time": 10.072462292481214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2090.5, "completions/mean_length": 1465.828125, "completions/mean_terminated_length": 545.3250274658203, "completions/min_length": 131.5, "completions/min_terminated_length": 131.5, "entropy": 0.02359977103769779, "epoch": 0.6129807692307693, "frac_reward_zero_std": 0.0, "grad_norm": 0.007085125893354416, "learning_rate": 3.8713942307692307e-07, "loss": -0.0049, "num_tokens": 106834807.0, "reward": 0.5726176500320435, "reward_std": 0.31728415191173553, "rewards/reward_fn/mean": 0.5726176500320435, "rewards/reward_fn/std": 0.31728416681289673, "sampling/importance_sampling_ratio/max": 1.7839205265045166, "sampling/importance_sampling_ratio/mean": 0.3086121678352356, "sampling/importance_sampling_ratio/min": 1.610582535249705e-05, "sampling/sampling_logp_difference/max": 2.0828499794006348, "sampling/sampling_logp_difference/mean": 0.005259329918771982, "step": 5100, "step_time": 7.163532029464841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1891.3333333333333, "completions/mean_length": 778.125, "completions/mean_terminated_length": 458.4779357910156, "completions/min_length": 114.33333333333333, "completions/min_terminated_length": 114.33333333333333, "entropy": 0.02502539586275816, "epoch": 0.6141826923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.005539552308619022, "learning_rate": 3.8593749999999996e-07, "loss": -0.0052, "num_tokens": 107031635.0, "reward": 0.7753485639890035, "reward_std": 0.2570799191792806, "rewards/reward_fn/mean": 0.7753485639890035, "rewards/reward_fn/std": 0.25707991421222687, "sampling/importance_sampling_ratio/max": 1.3628885348637898, "sampling/importance_sampling_ratio/mean": 0.43821999430656433, "sampling/importance_sampling_ratio/min": 0.0003947003301618679, "sampling/sampling_logp_difference/max": 2.2025370597839355, "sampling/sampling_logp_difference/mean": 0.005599505578478177, "step": 5110, "step_time": 9.924942033458501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1943.0, "completions/mean_length": 1229.84375, "completions/mean_terminated_length": 539.8190460205078, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.023071412183344363, "epoch": 0.6153846153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.0036820562090724707, "learning_rate": 3.847355769230769e-07, "loss": 0.0112, "num_tokens": 107176337.0, "reward": 0.7134757041931152, "reward_std": 0.3067057952284813, "rewards/reward_fn/mean": 0.7134757041931152, "rewards/reward_fn/std": 0.3067057803273201, "sampling/importance_sampling_ratio/max": 1.7474214434623718, "sampling/importance_sampling_ratio/mean": 0.399244949221611, "sampling/importance_sampling_ratio/min": 0.0005659765674863593, "sampling/sampling_logp_difference/max": 2.255835175514221, "sampling/sampling_logp_difference/mean": 0.004715740215033293, "step": 5120, "step_time": 7.015046893432737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3645833333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1976.0, "completions/mean_length": 1416.6979166666667, "completions/mean_terminated_length": 507.60227457682294, "completions/min_length": 148.33333333333334, "completions/min_terminated_length": 148.33333333333334, "entropy": 0.022611515037715434, "epoch": 0.6165865384615384, "frac_reward_zero_std": 0.0, "grad_norm": 0.02048870176076889, "learning_rate": 3.8353365384615384e-07, "loss": 0.0049, "num_tokens": 107428356.0, "reward": 0.6528215408325195, "reward_std": 0.2734450399875641, "rewards/reward_fn/mean": 0.6528215408325195, "rewards/reward_fn/std": 0.2734450399875641, "sampling/importance_sampling_ratio/max": 1.2658186554908752, "sampling/importance_sampling_ratio/mean": 0.35076454778512317, "sampling/importance_sampling_ratio/min": 8.87329448081194e-05, "sampling/sampling_logp_difference/max": 3.577969948450724, "sampling/sampling_logp_difference/mean": 0.0046679650743802386, "step": 5130, "step_time": 10.227602544333786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2853.0, "completions/mean_length": 1412.671875, "completions/mean_terminated_length": 796.4034118652344, "completions/min_length": 198.5, "completions/min_terminated_length": 198.5, "entropy": 0.02708234004676342, "epoch": 0.6177884615384616, "frac_reward_zero_std": 0.0, "grad_norm": 0.011372754350304604, "learning_rate": 3.8233173076923073e-07, "loss": -0.008, "num_tokens": 107599687.0, "reward": 0.7233726978302002, "reward_std": 0.2583373263478279, "rewards/reward_fn/mean": 0.7233726978302002, "rewards/reward_fn/std": 0.2583373188972473, "sampling/importance_sampling_ratio/max": 0.9762384295463562, "sampling/importance_sampling_ratio/mean": 0.2549780309200287, "sampling/importance_sampling_ratio/min": 6.017001851432724e-06, "sampling/sampling_logp_difference/max": 3.4180290699005127, "sampling/sampling_logp_difference/mean": 0.005636987742036581, "step": 5140, "step_time": 7.1582794570364054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23958333333333334, "completions/max_length": 2236.3333333333335, "completions/max_terminated_length": 1184.3333333333333, "completions/mean_length": 1056.6145833333333, "completions/mean_terminated_length": 476.27088419596356, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.024323127605021, "epoch": 0.6189903846153846, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.044147804379463196, "learning_rate": 3.811298076923077e-07, "loss": -0.0266, "num_tokens": 107824682.0, "reward": 0.7679986953735352, "reward_std": 0.223066712419192, "rewards/reward_fn/mean": 0.7679986953735352, "rewards/reward_fn/std": 0.22306672732035318, "sampling/importance_sampling_ratio/max": 1.7825724681218464, "sampling/importance_sampling_ratio/mean": 0.44467733303705853, "sampling/importance_sampling_ratio/min": 0.011024178598442328, "sampling/sampling_logp_difference/max": 3.1936429341634116, "sampling/sampling_logp_difference/mean": 0.005460136880477269, "step": 5150, "step_time": 7.796890005189925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2080.0, "completions/mean_length": 1429.203125, "completions/mean_terminated_length": 665.7251281738281, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.023483704216778277, "epoch": 0.6201923076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.010731678456068039, "learning_rate": 3.799278846153846e-07, "loss": -0.0069, "num_tokens": 107983575.0, "reward": 0.7253237068653107, "reward_std": 0.2551347017288208, "rewards/reward_fn/mean": 0.7253237068653107, "rewards/reward_fn/std": 0.2551346868276596, "sampling/importance_sampling_ratio/max": 1.1025964319705963, "sampling/importance_sampling_ratio/mean": 0.2785242050886154, "sampling/importance_sampling_ratio/min": 5.71428704461141e-05, "sampling/sampling_logp_difference/max": 1.6114938259124756, "sampling/sampling_logp_difference/mean": 0.004886431619524956, "step": 5160, "step_time": 7.1629976809956135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.14583333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2363.6666666666665, "completions/mean_length": 875.5208333333334, "completions/mean_terminated_length": 516.7076619466146, "completions/min_length": 144.66666666666666, "completions/min_terminated_length": 144.66666666666666, "entropy": 0.02040398195385933, "epoch": 0.6213942307692307, "frac_reward_zero_std": 0.25, "grad_norm": 0.005046122707426548, "learning_rate": 3.7872596153846156e-07, "loss": 0.0015, "num_tokens": 108165033.0, "reward": 0.7574612100919088, "reward_std": 0.26551658908526105, "rewards/reward_fn/mean": 0.7574612100919088, "rewards/reward_fn/std": 0.2655165841182073, "sampling/importance_sampling_ratio/max": 1.3808907270431519, "sampling/importance_sampling_ratio/mean": 0.46854350964228314, "sampling/importance_sampling_ratio/min": 0.00010948121098408592, "sampling/sampling_logp_difference/max": 2.6079339186350503, "sampling/sampling_logp_difference/mean": 0.004697275037566821, "step": 5170, "step_time": 10.039740028697997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2224.5, "completions/mean_length": 1369.46875, "completions/mean_terminated_length": 517.7545776367188, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.021750188432633877, "epoch": 0.6225961538461539, "frac_reward_zero_std": 0.0, "grad_norm": 0.011605114676058292, "learning_rate": 3.7752403846153845e-07, "loss": 0.0037, "num_tokens": 108336639.0, "reward": 0.720849335193634, "reward_std": 0.2431546300649643, "rewards/reward_fn/mean": 0.720849335193634, "rewards/reward_fn/std": 0.2431546226143837, "sampling/importance_sampling_ratio/max": 1.1814351081848145, "sampling/importance_sampling_ratio/mean": 0.33218222856521606, "sampling/importance_sampling_ratio/min": 0.0007074041677697096, "sampling/sampling_logp_difference/max": 1.9597123861312866, "sampling/sampling_logp_difference/mean": 0.004010421223938465, "step": 5180, "step_time": 7.369488128926605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2708333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1792.6666666666667, "completions/mean_length": 1207.9166666666667, "completions/mean_terminated_length": 541.8360900878906, "completions/min_length": 139.33333333333334, "completions/min_terminated_length": 139.33333333333334, "entropy": 0.024204087257385255, "epoch": 0.6237980769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.010484467260539532, "learning_rate": 3.763221153846154e-07, "loss": -0.0053, "num_tokens": 108562167.0, "reward": 0.7428875962893168, "reward_std": 0.2376222014427185, "rewards/reward_fn/mean": 0.7428875962893168, "rewards/reward_fn/std": 0.2376222014427185, "sampling/importance_sampling_ratio/max": 1.4920626878738403, "sampling/importance_sampling_ratio/mean": 0.32768072684605914, "sampling/importance_sampling_ratio/min": 9.267073689045446e-05, "sampling/sampling_logp_difference/max": 1.7404690583546956, "sampling/sampling_logp_difference/mean": 0.0058071174037953215, "step": 5190, "step_time": 10.302342902123929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2573.0, "completions/mean_length": 998.578125, "completions/mean_terminated_length": 500.61492919921875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.022517410293221473, "epoch": 0.625, "frac_reward_zero_std": 0.0, "grad_norm": 0.013569226488471031, "learning_rate": 3.7512019230769234e-07, "loss": 0.0084, "num_tokens": 108689012.0, "reward": 0.7096470594406128, "reward_std": 0.3145666867494583, "rewards/reward_fn/mean": 0.7096470594406128, "rewards/reward_fn/std": 0.3145666867494583, "sampling/importance_sampling_ratio/max": 1.2145321369171143, "sampling/importance_sampling_ratio/mean": 0.4245801419019699, "sampling/importance_sampling_ratio/min": 0.00022039124087314121, "sampling/sampling_logp_difference/max": 2.3284393548965454, "sampling/sampling_logp_difference/mean": 0.004960199119523168, "step": 5200, "step_time": 6.981926620192826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3229166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2147.3333333333335, "completions/mean_length": 1409.9166666666667, "completions/mean_terminated_length": 644.6442464192709, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.023727324418723585, "epoch": 0.6262019230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.01135667972266674, "learning_rate": 3.7391826923076917e-07, "loss": -0.0041, "num_tokens": 108942724.0, "reward": 0.7256474494934082, "reward_std": 0.260626624027888, "rewards/reward_fn/mean": 0.7256474494934082, "rewards/reward_fn/std": 0.26062661906083423, "sampling/importance_sampling_ratio/max": 1.4343031247456868, "sampling/importance_sampling_ratio/mean": 0.31762611865997314, "sampling/importance_sampling_ratio/min": 5.02064476677333e-05, "sampling/sampling_logp_difference/max": 3.3870912392934165, "sampling/sampling_logp_difference/mean": 0.005141251254826784, "step": 5210, "step_time": 10.380367839802057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 2172.0, "completions/mean_length": 1245.71875, "completions/mean_terminated_length": 686.4867858886719, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.025710871815681456, "epoch": 0.6274038461538461, "frac_reward_zero_std": 0.0, "grad_norm": 0.00542575865983963, "learning_rate": 3.727163461538461e-07, "loss": -0.0044, "num_tokens": 109085946.0, "reward": 0.7147109508514404, "reward_std": 0.27297110855579376, "rewards/reward_fn/mean": 0.7147109508514404, "rewards/reward_fn/std": 0.27297112345695496, "sampling/importance_sampling_ratio/max": 1.55776047706604, "sampling/importance_sampling_ratio/mean": 0.321626216173172, "sampling/importance_sampling_ratio/min": 0.0001735564354703456, "sampling/sampling_logp_difference/max": 2.43395334482193, "sampling/sampling_logp_difference/mean": 0.0060181389562785625, "step": 5220, "step_time": 6.987859474029392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2604166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1414.0, "completions/mean_length": 1068.1875, "completions/mean_terminated_length": 376.3027852376302, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.023094687424600125, "epoch": 0.6286057692307693, "frac_reward_zero_std": 0.0, "grad_norm": 0.003120486391708255, "learning_rate": 3.7151442307692306e-07, "loss": -0.0078, "num_tokens": 109289588.0, "reward": 0.7508229613304138, "reward_std": 0.2517956793308258, "rewards/reward_fn/mean": 0.7508229613304138, "rewards/reward_fn/std": 0.2517956842978795, "sampling/importance_sampling_ratio/max": 1.564601461092631, "sampling/importance_sampling_ratio/mean": 0.4290776451428731, "sampling/importance_sampling_ratio/min": 7.689519163989189e-05, "sampling/sampling_logp_difference/max": 3.6079951922098794, "sampling/sampling_logp_difference/mean": 0.005508705507963896, "step": 5230, "step_time": 10.156053254660218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2045.5, "completions/mean_length": 722.953125, "completions/mean_terminated_length": 395.8269500732422, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.024931612238287926, "epoch": 0.6298076923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.02381431683897972, "learning_rate": 3.7031249999999995e-07, "loss": -0.0026, "num_tokens": 109399825.0, "reward": 0.7634036540985107, "reward_std": 0.2163991779088974, "rewards/reward_fn/mean": 0.7634036540985107, "rewards/reward_fn/std": 0.2163991779088974, "sampling/importance_sampling_ratio/max": 1.235736072063446, "sampling/importance_sampling_ratio/mean": 0.500228762626648, "sampling/importance_sampling_ratio/min": 0.0011768130507334718, "sampling/sampling_logp_difference/max": 2.260818123817444, "sampling/sampling_logp_difference/mean": 0.00565376179292798, "step": 5240, "step_time": 6.814514655433595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2604166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2257.0, "completions/mean_length": 1119.1041666666667, "completions/mean_terminated_length": 469.50083414713544, "completions/min_length": 115.33333333333333, "completions/min_terminated_length": 115.33333333333333, "entropy": 0.02167375348508358, "epoch": 0.6310096153846154, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.03224974870681763, "learning_rate": 3.691105769230769e-07, "loss": -0.0036, "num_tokens": 109605579.0, "reward": 0.7686025500297546, "reward_std": 0.21343747278054556, "rewards/reward_fn/mean": 0.7686025500297546, "rewards/reward_fn/std": 0.21343747278054556, "sampling/importance_sampling_ratio/max": 1.999392072359721, "sampling/importance_sampling_ratio/mean": 0.4505297839641571, "sampling/importance_sampling_ratio/min": 6.164307520369523e-05, "sampling/sampling_logp_difference/max": 2.1485151847203574, "sampling/sampling_logp_difference/mean": 0.0045158335318168, "step": 5250, "step_time": 10.156536932010203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2208.5, "completions/mean_length": 811.71875, "completions/mean_terminated_length": 406.4814910888672, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.02359228953719139, "epoch": 0.6322115384615384, "frac_reward_zero_std": 0.125, "grad_norm": 0.0064560058526694775, "learning_rate": 3.6790865384615383e-07, "loss": -0.0058, "num_tokens": 109730921.0, "reward": 0.7178639769554138, "reward_std": 0.3137791305780411, "rewards/reward_fn/mean": 0.7178639769554138, "rewards/reward_fn/std": 0.3137791305780411, "sampling/importance_sampling_ratio/max": 1.1466519832611084, "sampling/importance_sampling_ratio/mean": 0.45643897354602814, "sampling/importance_sampling_ratio/min": 1.610154458830948e-05, "sampling/sampling_logp_difference/max": 1.2953029870986938, "sampling/sampling_logp_difference/mean": 0.005118751898407936, "step": 5260, "step_time": 6.897609099559486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1957.3333333333333, "completions/mean_length": 845.3229166666666, "completions/mean_terminated_length": 456.80137125651044, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.02245311290025711, "epoch": 0.6334134615384616, "frac_reward_zero_std": 0.0, "grad_norm": 0.02181435376405716, "learning_rate": 3.667067307692308e-07, "loss": -0.0035, "num_tokens": 109900896.0, "reward": 0.8002712925275167, "reward_std": 0.2258764108022054, "rewards/reward_fn/mean": 0.8002712925275167, "rewards/reward_fn/std": 0.22587642073631287, "sampling/importance_sampling_ratio/max": 1.8622669378916423, "sampling/importance_sampling_ratio/mean": 0.46423621972401935, "sampling/importance_sampling_ratio/min": 0.0006233263096267668, "sampling/sampling_logp_difference/max": 1.655385692914327, "sampling/sampling_logp_difference/mean": 0.0050649214535951614, "step": 5270, "step_time": 10.125968467164785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1930.0, "completions/mean_length": 969.375, "completions/mean_terminated_length": 548.5563049316406, "completions/min_length": 124.5, "completions/min_terminated_length": 124.5, "entropy": 0.024705729633569717, "epoch": 0.6346153846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.006557064596563578, "learning_rate": 3.6550480769230767e-07, "loss": 0.0105, "num_tokens": 110019968.0, "reward": 0.7958244681358337, "reward_std": 0.21785727888345718, "rewards/reward_fn/mean": 0.7958244681358337, "rewards/reward_fn/std": 0.21785727888345718, "sampling/importance_sampling_ratio/max": 1.758977472782135, "sampling/importance_sampling_ratio/mean": 0.35639315843582153, "sampling/importance_sampling_ratio/min": 2.583505306574807e-06, "sampling/sampling_logp_difference/max": 1.656272828578949, "sampling/sampling_logp_difference/mean": 0.0057868049480021, "step": 5280, "step_time": 7.081683076918125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 1491.0, "completions/mean_length": 1056.96875, "completions/mean_terminated_length": 427.98838297526044, "completions/min_length": 135.66666666666666, "completions/min_terminated_length": 135.66666666666666, "entropy": 0.02412305921316147, "epoch": 0.6358173076923077, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.010397780686616898, "learning_rate": 3.643028846153846e-07, "loss": 0.0036, "num_tokens": 110230653.0, "reward": 0.7825018763542175, "reward_std": 0.20095142722129822, "rewards/reward_fn/mean": 0.7825018763542175, "rewards/reward_fn/std": 0.20095142473777136, "sampling/importance_sampling_ratio/max": 2.08659565448761, "sampling/importance_sampling_ratio/mean": 0.45597731073697406, "sampling/importance_sampling_ratio/min": 0.0010791726723458812, "sampling/sampling_logp_difference/max": 2.315369407335917, "sampling/sampling_logp_difference/mean": 0.0053095716672639055, "step": 5290, "step_time": 10.158486024383455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2119.5, "completions/mean_length": 1263.546875, "completions/mean_terminated_length": 621.0961608886719, "completions/min_length": 102.5, "completions/min_terminated_length": 102.5, "entropy": 0.024738151207566263, "epoch": 0.6370192307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.00948407594114542, "learning_rate": 3.6310096153846155e-07, "loss": -0.0023, "num_tokens": 110387952.0, "reward": 0.6631031930446625, "reward_std": 0.33382847905158997, "rewards/reward_fn/mean": 0.6631031930446625, "rewards/reward_fn/std": 0.33382846415042877, "sampling/importance_sampling_ratio/max": 1.4105368852615356, "sampling/importance_sampling_ratio/mean": 0.32597118616104126, "sampling/importance_sampling_ratio/min": 1.896088454600431e-05, "sampling/sampling_logp_difference/max": 2.2322381734848022, "sampling/sampling_logp_difference/mean": 0.005561214638873935, "step": 5300, "step_time": 7.1010569194331765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 1580.0, "completions/mean_length": 854.4895833333334, "completions/mean_terminated_length": 427.7346496582031, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.020467922650277616, "epoch": 0.6382211538461539, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.04768664389848709, "learning_rate": 3.6189903846153844e-07, "loss": -0.0064, "num_tokens": 110582455.0, "reward": 0.6687557200590769, "reward_std": 0.26919760803381604, "rewards/reward_fn/mean": 0.6687557200590769, "rewards/reward_fn/std": 0.26919760803381604, "sampling/importance_sampling_ratio/max": 1.9492392539978027, "sampling/importance_sampling_ratio/mean": 0.5137519041697184, "sampling/importance_sampling_ratio/min": 0.0001381159327138448, "sampling/sampling_logp_difference/max": 2.337810436884562, "sampling/sampling_logp_difference/mean": 0.005464815689871709, "step": 5310, "step_time": 9.7649396087043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2079.0, "completions/mean_length": 1562.9375, "completions/mean_terminated_length": 674.0803833007812, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.02485385723412037, "epoch": 0.6394230769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.015483393333852291, "learning_rate": 3.606971153846154e-07, "loss": 0.0034, "num_tokens": 110753539.0, "reward": 0.6425249874591827, "reward_std": 0.30052758753299713, "rewards/reward_fn/mean": 0.6425249874591827, "rewards/reward_fn/std": 0.30052758753299713, "sampling/importance_sampling_ratio/max": 1.2947282195091248, "sampling/importance_sampling_ratio/mean": 0.2565772794187069, "sampling/importance_sampling_ratio/min": 6.553182629431831e-06, "sampling/sampling_logp_difference/max": 2.4487966299057007, "sampling/sampling_logp_difference/mean": 0.004969417117536068, "step": 5320, "step_time": 7.177024411503226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2047.6666666666667, "completions/mean_length": 1017.09375, "completions/mean_terminated_length": 458.4799499511719, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.02283474327996373, "epoch": 0.640625, "frac_reward_zero_std": 0.0, "grad_norm": 0.011319751851260662, "learning_rate": 3.5949519230769233e-07, "loss": -0.0007, "num_tokens": 110956644.0, "reward": 0.7744050224622091, "reward_std": 0.24233123660087585, "rewards/reward_fn/mean": 0.7744050224622091, "rewards/reward_fn/std": 0.2423312266667684, "sampling/importance_sampling_ratio/max": 1.9684813419977825, "sampling/importance_sampling_ratio/mean": 0.40643344322840375, "sampling/importance_sampling_ratio/min": 1.405092295196179e-05, "sampling/sampling_logp_difference/max": 2.666784962018331, "sampling/sampling_logp_difference/mean": 0.005648438508311908, "step": 5330, "step_time": 10.364925489388407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1730.5, "completions/mean_length": 980.84375, "completions/mean_terminated_length": 375.9200439453125, "completions/min_length": 116.5, "completions/min_terminated_length": 116.5, "entropy": 0.027252384461462498, "epoch": 0.6418269230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.01741604506969452, "learning_rate": 3.582932692307692e-07, "loss": -0.0107, "num_tokens": 111097786.0, "reward": 0.7042264342308044, "reward_std": 0.2924465090036392, "rewards/reward_fn/mean": 0.7042264342308044, "rewards/reward_fn/std": 0.2924465015530586, "sampling/importance_sampling_ratio/max": 1.7181238532066345, "sampling/importance_sampling_ratio/mean": 0.4300633668899536, "sampling/importance_sampling_ratio/min": 0.0002588243005448021, "sampling/sampling_logp_difference/max": 2.447130024433136, "sampling/sampling_logp_difference/mean": 0.0057995079550892115, "step": 5340, "step_time": 6.992954677995295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2050.0, "completions/mean_length": 1055.0520833333333, "completions/mean_terminated_length": 607.5323486328125, "completions/min_length": 156.66666666666666, "completions/min_terminated_length": 156.66666666666666, "entropy": 0.0249421251937747, "epoch": 0.6430288461538461, "frac_reward_zero_std": 0.0, "grad_norm": 0.02957952581346035, "learning_rate": 3.570913461538461e-07, "loss": 0.0011, "num_tokens": 111301567.0, "reward": 0.7049202720324198, "reward_std": 0.2934993306795756, "rewards/reward_fn/mean": 0.7049202720324198, "rewards/reward_fn/std": 0.29349933564662933, "sampling/importance_sampling_ratio/max": 1.0975364844004314, "sampling/importance_sampling_ratio/mean": 0.28262267510096234, "sampling/importance_sampling_ratio/min": 3.774749025827381e-05, "sampling/sampling_logp_difference/max": 3.37062931060791, "sampling/sampling_logp_difference/mean": 0.0055261775851249695, "step": 5350, "step_time": 10.410921028163283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2484.0, "completions/mean_length": 1062.328125, "completions/mean_terminated_length": 574.9317016601562, "completions/min_length": 122.5, "completions/min_terminated_length": 122.5, "entropy": 0.022305554896593093, "epoch": 0.6442307692307693, "frac_reward_zero_std": 0.0, "grad_norm": 0.008551722392439842, "learning_rate": 3.5588942307692305e-07, "loss": -0.0067, "num_tokens": 111439060.0, "reward": 0.6975843608379364, "reward_std": 0.34132616221904755, "rewards/reward_fn/mean": 0.6975843608379364, "rewards/reward_fn/std": 0.34132614731788635, "sampling/importance_sampling_ratio/max": 1.7665472626686096, "sampling/importance_sampling_ratio/mean": 0.40735478699207306, "sampling/importance_sampling_ratio/min": 1.0688175336781569e-05, "sampling/sampling_logp_difference/max": 3.979189157485962, "sampling/sampling_logp_difference/mean": 0.005729941185563803, "step": 5360, "step_time": 6.933439264819026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3854166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2097.0, "completions/mean_length": 1564.3020833333333, "completions/mean_terminated_length": 719.2113952636719, "completions/min_length": 136.66666666666666, "completions/min_terminated_length": 136.66666666666666, "entropy": 0.02334568854421377, "epoch": 0.6454326923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.011017199605703354, "learning_rate": 3.546875e-07, "loss": -0.0022, "num_tokens": 111710665.0, "reward": 0.689439594745636, "reward_std": 0.2584569404522578, "rewards/reward_fn/mean": 0.689439594745636, "rewards/reward_fn/std": 0.2584569404522578, "sampling/importance_sampling_ratio/max": 1.4997199773788452, "sampling/importance_sampling_ratio/mean": 0.25768092771371204, "sampling/importance_sampling_ratio/min": 4.447192516939443e-05, "sampling/sampling_logp_difference/max": 1.8895780642827351, "sampling/sampling_logp_difference/mean": 0.005360551488896211, "step": 5370, "step_time": 10.371898875851183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2417.0, "completions/mean_length": 1057.28125, "completions/mean_terminated_length": 472.8452453613281, "completions/min_length": 128.5, "completions/min_terminated_length": 128.5, "entropy": 0.02416342180222273, "epoch": 0.6466346153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.008731910958886147, "learning_rate": 3.534855769230769e-07, "loss": -0.0046, "num_tokens": 111858187.0, "reward": 0.7382872998714447, "reward_std": 0.2456808015704155, "rewards/reward_fn/mean": 0.7382872998714447, "rewards/reward_fn/std": 0.2456807941198349, "sampling/importance_sampling_ratio/max": 1.8547605872154236, "sampling/importance_sampling_ratio/mean": 0.3877914845943451, "sampling/importance_sampling_ratio/min": 1.6430876712547615e-05, "sampling/sampling_logp_difference/max": 3.780111074447632, "sampling/sampling_logp_difference/mean": 0.0050512151792645454, "step": 5380, "step_time": 6.893091963045299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20833333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1667.3333333333333, "completions/mean_length": 995.8645833333334, "completions/mean_terminated_length": 469.31891377766925, "completions/min_length": 136.66666666666666, "completions/min_terminated_length": 136.66666666666666, "entropy": 0.026919155567884444, "epoch": 0.6478365384615384, "frac_reward_zero_std": 0.0, "grad_norm": 0.02430771477520466, "learning_rate": 3.522836538461538e-07, "loss": -0.0102, "num_tokens": 112066470.0, "reward": 0.7972240050633749, "reward_std": 0.20331869522730509, "rewards/reward_fn/mean": 0.7972240050633749, "rewards/reward_fn/std": 0.20331869522730509, "sampling/importance_sampling_ratio/max": 1.6477829019228618, "sampling/importance_sampling_ratio/mean": 0.3649936318397522, "sampling/importance_sampling_ratio/min": 0.0001269853454080779, "sampling/sampling_logp_difference/max": 3.6386834383010864, "sampling/sampling_logp_difference/mean": 0.006189562535534303, "step": 5390, "step_time": 10.188632221799343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1746.5, "completions/mean_length": 1036.65625, "completions/mean_terminated_length": 538.2245635986328, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.030977327562868594, "epoch": 0.6490384615384616, "frac_reward_zero_std": 0.0, "grad_norm": 0.006741190794855356, "learning_rate": 3.5108173076923077e-07, "loss": -0.0017, "num_tokens": 112211976.0, "reward": 0.7659538984298706, "reward_std": 0.24625447392463684, "rewards/reward_fn/mean": 0.7659538984298706, "rewards/reward_fn/std": 0.24625445902347565, "sampling/importance_sampling_ratio/max": 1.091595470905304, "sampling/importance_sampling_ratio/mean": 0.2271578013896942, "sampling/importance_sampling_ratio/min": 1.874046938610263e-05, "sampling/sampling_logp_difference/max": 3.4866085052490234, "sampling/sampling_logp_difference/mean": 0.006558760767802596, "step": 5400, "step_time": 7.013961601443588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2916666666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1457.3333333333333, "completions/mean_length": 1199.5, "completions/mean_terminated_length": 459.43470255533856, "completions/min_length": 148.33333333333334, "completions/min_terminated_length": 148.33333333333334, "entropy": 0.023803429491817952, "epoch": 0.6502403846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.007461231201887131, "learning_rate": 3.4987980769230766e-07, "loss": -0.0039, "num_tokens": 112436976.0, "reward": 0.7303009629249573, "reward_std": 0.2597893526156743, "rewards/reward_fn/mean": 0.7303009629249573, "rewards/reward_fn/std": 0.2597893526156743, "sampling/importance_sampling_ratio/max": 1.479163944721222, "sampling/importance_sampling_ratio/mean": 0.3633896013100942, "sampling/importance_sampling_ratio/min": 7.186237053247169e-05, "sampling/sampling_logp_difference/max": 3.117318034172058, "sampling/sampling_logp_difference/mean": 0.005381275744487842, "step": 5410, "step_time": 10.25793440276757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 847.21875, "completions/mean_terminated_length": 401.31214904785156, "completions/min_length": 98.5, "completions/min_terminated_length": 98.5, "entropy": 0.024934536404907705, "epoch": 0.6514423076923077, "frac_reward_zero_std": 0.25, "grad_norm": 0.008757857605814934, "learning_rate": 3.486778846153846e-07, "loss": -0.0044, "num_tokens": 112575454.0, "reward": 0.8334181308746338, "reward_std": 0.17193763703107834, "rewards/reward_fn/mean": 0.8334181308746338, "rewards/reward_fn/std": 0.17193763703107834, "sampling/importance_sampling_ratio/max": 1.1911499202251434, "sampling/importance_sampling_ratio/mean": 0.4202286899089813, "sampling/importance_sampling_ratio/min": 0.0002717119014050695, "sampling/sampling_logp_difference/max": 1.5169521570205688, "sampling/sampling_logp_difference/mean": 0.0052175133023411036, "step": 5420, "step_time": 7.009736295603216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2259.0, "completions/mean_length": 1313.7395833333333, "completions/mean_terminated_length": 563.5641276041666, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.019934427458792925, "epoch": 0.6526442307692307, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.002740197815001011, "learning_rate": 3.4747596153846154e-07, "loss": 0.001, "num_tokens": 112817229.0, "reward": 0.5857817927996317, "reward_std": 0.2797772487004598, "rewards/reward_fn/mean": 0.5857817927996317, "rewards/reward_fn/std": 0.27977722386519116, "sampling/importance_sampling_ratio/max": 1.2734482288360596, "sampling/importance_sampling_ratio/mean": 0.4093297521273295, "sampling/importance_sampling_ratio/min": 0.000333162505285145, "sampling/sampling_logp_difference/max": 3.405109087626139, "sampling/sampling_logp_difference/mean": 0.004417288427551587, "step": 5430, "step_time": 10.447181465383618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2725.0, "completions/mean_length": 1249.84375, "completions/mean_terminated_length": 802.2361145019531, "completions/min_length": 167.5, "completions/min_terminated_length": 167.5, "entropy": 0.024215361289680003, "epoch": 0.6538461538461539, "frac_reward_zero_std": 0.0, "grad_norm": 0.009081769734621048, "learning_rate": 3.462740384615385e-07, "loss": -0.0006, "num_tokens": 112966627.0, "reward": 0.7820249199867249, "reward_std": 0.24363940954208374, "rewards/reward_fn/mean": 0.7820249199867249, "rewards/reward_fn/std": 0.24363938719034195, "sampling/importance_sampling_ratio/max": 0.8800358176231384, "sampling/importance_sampling_ratio/mean": 0.20848453044891357, "sampling/importance_sampling_ratio/min": 8.12202924862504e-05, "sampling/sampling_logp_difference/max": 3.37280809879303, "sampling/sampling_logp_difference/mean": 0.005467945942655206, "step": 5440, "step_time": 7.066511819604784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19791666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 1631.3333333333333, "completions/mean_length": 896.6458333333334, "completions/mean_terminated_length": 385.7272542317708, "completions/min_length": 112.66666666666667, "completions/min_terminated_length": 112.66666666666667, "entropy": 0.022099135257303715, "epoch": 0.6550480769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.008360368199646473, "learning_rate": 3.450721153846154e-07, "loss": -0.0001, "num_tokens": 113162849.0, "reward": 0.7709751526514689, "reward_std": 0.22816592951615652, "rewards/reward_fn/mean": 0.7709751526514689, "rewards/reward_fn/std": 0.22816592951615652, "sampling/importance_sampling_ratio/max": 2.0776341756184897, "sampling/importance_sampling_ratio/mean": 0.4851218561331431, "sampling/importance_sampling_ratio/min": 0.00711119442985364, "sampling/sampling_logp_difference/max": 2.1288894017537436, "sampling/sampling_logp_difference/mean": 0.0051001507478455705, "step": 5450, "step_time": 9.789397844020277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2733.5, "completions/mean_length": 1502.578125, "completions/mean_terminated_length": 807.5320739746094, "completions/min_length": 153.5, "completions/min_terminated_length": 153.5, "entropy": 0.023934657499194146, "epoch": 0.65625, "frac_reward_zero_std": 0.0, "grad_norm": 0.024131815880537033, "learning_rate": 3.438701923076923e-07, "loss": -0.0006, "num_tokens": 113323894.0, "reward": 0.6533952355384827, "reward_std": 0.30592963099479675, "rewards/reward_fn/mean": 0.6533952355384827, "rewards/reward_fn/std": 0.30592963099479675, "sampling/importance_sampling_ratio/max": 1.339994341135025, "sampling/importance_sampling_ratio/mean": 0.2592371702194214, "sampling/importance_sampling_ratio/min": 0.0007806452142631315, "sampling/sampling_logp_difference/max": 2.793313443660736, "sampling/sampling_logp_difference/mean": 0.005434292135760188, "step": 5460, "step_time": 6.97749703777954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.17708333333333334, "completions/max_length": 2507.6666666666665, "completions/max_terminated_length": 1550.3333333333333, "completions/mean_length": 886.8541666666666, "completions/mean_terminated_length": 442.8662516276042, "completions/min_length": 135.66666666666666, "completions/min_terminated_length": 135.66666666666666, "entropy": 0.023269068636000156, "epoch": 0.6574519230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.07296942174434662, "learning_rate": 3.426682692307692e-07, "loss": 0.0013, "num_tokens": 113520920.0, "reward": 0.7955960830052694, "reward_std": 0.18334108094374338, "rewards/reward_fn/mean": 0.7955960830052694, "rewards/reward_fn/std": 0.18334108094374338, "sampling/importance_sampling_ratio/max": 1.9518317381540935, "sampling/importance_sampling_ratio/mean": 0.4777453541755676, "sampling/importance_sampling_ratio/min": 0.0034631069502211176, "sampling/sampling_logp_difference/max": 1.709571361541748, "sampling/sampling_logp_difference/mean": 0.005548880901187658, "step": 5470, "step_time": 8.584785458259285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2811.5, "completions/mean_length": 1367.5, "completions/mean_terminated_length": 625.4545593261719, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.02377788759768009, "epoch": 0.6586538461538461, "frac_reward_zero_std": 0.125, "grad_norm": 0.004384294152259827, "learning_rate": 3.414663461538461e-07, "loss": 0.0027, "num_tokens": 113696488.0, "reward": 0.6266945600509644, "reward_std": 0.3109823316335678, "rewards/reward_fn/mean": 0.6266945600509644, "rewards/reward_fn/std": 0.3109823316335678, "sampling/importance_sampling_ratio/max": 0.8643659353256226, "sampling/importance_sampling_ratio/mean": 0.24346565455198288, "sampling/importance_sampling_ratio/min": 4.352251607997459e-06, "sampling/sampling_logp_difference/max": 3.1997915506362915, "sampling/sampling_logp_difference/mean": 0.005234864307567477, "step": 5480, "step_time": 7.09747653491795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 1036.4479166666667, "completions/mean_terminated_length": 548.2347615559896, "completions/min_length": 100.33333333333333, "completions/min_terminated_length": 100.33333333333333, "entropy": 0.02122503872960806, "epoch": 0.6598557692307693, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.014303735457360744, "learning_rate": 3.4026442307692304e-07, "loss": 0.0037, "num_tokens": 113902715.0, "reward": 0.6956583460172018, "reward_std": 0.31013935307661694, "rewards/reward_fn/mean": 0.6956583460172018, "rewards/reward_fn/std": 0.31013935307661694, "sampling/importance_sampling_ratio/max": 1.4489672581354778, "sampling/importance_sampling_ratio/mean": 0.483482946952184, "sampling/importance_sampling_ratio/min": 0.0033071409142166885, "sampling/sampling_logp_difference/max": 3.8374552726745605, "sampling/sampling_logp_difference/mean": 0.004862317815423012, "step": 5490, "step_time": 9.929896081611513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2015.5, "completions/mean_length": 940.84375, "completions/mean_terminated_length": 465.65386962890625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.023905313201248647, "epoch": 0.6610576923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.026251239702105522, "learning_rate": 3.390625e-07, "loss": -0.0029, "num_tokens": 114031361.0, "reward": 0.7922484278678894, "reward_std": 0.21420110017061234, "rewards/reward_fn/mean": 0.7922484278678894, "rewards/reward_fn/std": 0.21420109272003174, "sampling/importance_sampling_ratio/max": 1.5068643689155579, "sampling/importance_sampling_ratio/mean": 0.43131671845912933, "sampling/importance_sampling_ratio/min": 5.981033064017538e-05, "sampling/sampling_logp_difference/max": 1.7255179286003113, "sampling/sampling_logp_difference/mean": 0.005602387245744467, "step": 5500, "step_time": 7.146138022746891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3541666666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1822.3333333333333, "completions/mean_length": 1473.0208333333333, "completions/mean_terminated_length": 642.8172607421875, "completions/min_length": 118.33333333333333, "completions/min_terminated_length": 118.33333333333333, "entropy": 0.020534697733819485, "epoch": 0.6622596153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.0022270644549280405, "learning_rate": 3.378605769230769e-07, "loss": 0.0007, "num_tokens": 114278739.0, "reward": 0.7042433818181356, "reward_std": 0.2600280890862147, "rewards/reward_fn/mean": 0.7042433818181356, "rewards/reward_fn/std": 0.26002808411916095, "sampling/importance_sampling_ratio/max": 1.9951835076014202, "sampling/importance_sampling_ratio/mean": 0.3290434678395589, "sampling/importance_sampling_ratio/min": 7.071527609999369e-05, "sampling/sampling_logp_difference/max": 4.002086639404297, "sampling/sampling_logp_difference/mean": 0.0050744412777324515, "step": 5510, "step_time": 10.485510316025465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 1103.765625, "completions/mean_terminated_length": 508.20240783691406, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.02140685971826315, "epoch": 0.6634615384615384, "frac_reward_zero_std": 0.0, "grad_norm": 0.038663141429424286, "learning_rate": 3.366586538461538e-07, "loss": -0.0127, "num_tokens": 114405068.0, "reward": 0.6400830745697021, "reward_std": 0.3218672424554825, "rewards/reward_fn/mean": 0.6400830745697021, "rewards/reward_fn/std": 0.3218672573566437, "sampling/importance_sampling_ratio/max": 2.2264792919158936, "sampling/importance_sampling_ratio/mean": 0.4534348249435425, "sampling/importance_sampling_ratio/min": 2.6127770524908556e-07, "sampling/sampling_logp_difference/max": 3.97992205619812, "sampling/sampling_logp_difference/mean": 0.005366598488762975, "step": 5520, "step_time": 7.07894898597151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19791666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2548.6666666666665, "completions/mean_length": 1083.9583333333333, "completions/mean_terminated_length": 608.7867024739584, "completions/min_length": 113.33333333333333, "completions/min_terminated_length": 113.33333333333333, "entropy": 0.023987972736358644, "epoch": 0.6646634615384616, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.00493884552270174, "learning_rate": 3.3545673076923076e-07, "loss": -0.0033, "num_tokens": 114635384.0, "reward": 0.7723173300425211, "reward_std": 0.23324964940547943, "rewards/reward_fn/mean": 0.7723173300425211, "rewards/reward_fn/std": 0.2332496444384257, "sampling/importance_sampling_ratio/max": 1.9859758218129475, "sampling/importance_sampling_ratio/mean": 0.4494105080763499, "sampling/importance_sampling_ratio/min": 0.0004301047883927822, "sampling/sampling_logp_difference/max": 2.4641403357187905, "sampling/sampling_logp_difference/mean": 0.004544943027819197, "step": 5530, "step_time": 10.150983258523047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2385.0, "completions/mean_length": 1024.21875, "completions/mean_terminated_length": 630.3703155517578, "completions/min_length": 149.5, "completions/min_terminated_length": 149.5, "entropy": 0.026709275506436824, "epoch": 0.6658653846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.0061820936389267445, "learning_rate": 3.342548076923077e-07, "loss": 0.0052, "num_tokens": 114775686.0, "reward": 0.7762992978096008, "reward_std": 0.22920618206262589, "rewards/reward_fn/mean": 0.7762992978096008, "rewards/reward_fn/std": 0.2292061671614647, "sampling/importance_sampling_ratio/max": 1.5113651752471924, "sampling/importance_sampling_ratio/mean": 0.3191165030002594, "sampling/importance_sampling_ratio/min": 1.7391224560014962e-05, "sampling/sampling_logp_difference/max": 3.534265398979187, "sampling/sampling_logp_difference/mean": 0.006782964803278446, "step": 5540, "step_time": 6.9614929761737585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.14583333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1683.3333333333333, "completions/mean_length": 755.9166666666666, "completions/mean_terminated_length": 379.0575358072917, "completions/min_length": 120.33333333333333, "completions/min_terminated_length": 120.33333333333333, "entropy": 0.027185207605361937, "epoch": 0.6670673076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.00705135939642787, "learning_rate": 3.330528846153846e-07, "loss": -0.0051, "num_tokens": 114946654.0, "reward": 0.8046176632245382, "reward_std": 0.19858131309350333, "rewards/reward_fn/mean": 0.8046176632245382, "rewards/reward_fn/std": 0.19858131309350333, "sampling/importance_sampling_ratio/max": 1.5388845602671306, "sampling/importance_sampling_ratio/mean": 0.4815067648887634, "sampling/importance_sampling_ratio/min": 0.00010731482567886512, "sampling/sampling_logp_difference/max": 1.9148115317026775, "sampling/sampling_logp_difference/mean": 0.006181707450499137, "step": 5550, "step_time": 9.868457819800824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 1048.375, "completions/mean_terminated_length": 597.762939453125, "completions/min_length": 164.5, "completions/min_terminated_length": 164.5, "entropy": 0.024501992017030717, "epoch": 0.6682692307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.00964804645627737, "learning_rate": 3.3185096153846153e-07, "loss": -0.0005, "num_tokens": 115085814.0, "reward": 0.7902845144271851, "reward_std": 0.23293913155794144, "rewards/reward_fn/mean": 0.7902845144271851, "rewards/reward_fn/std": 0.23293912410736084, "sampling/importance_sampling_ratio/max": 1.751549482345581, "sampling/importance_sampling_ratio/mean": 0.37538258731365204, "sampling/importance_sampling_ratio/min": 7.6031551543565e-06, "sampling/sampling_logp_difference/max": 1.9075912237167358, "sampling/sampling_logp_difference/mean": 0.005414825165644288, "step": 5560, "step_time": 6.81088263085112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2180.3333333333335, "completions/mean_length": 1046.6979166666667, "completions/mean_terminated_length": 688.9560343424479, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.020830578915774823, "epoch": 0.6694711538461539, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.011004123836755753, "learning_rate": 3.306490384615385e-07, "loss": 0.0004, "num_tokens": 115279001.0, "reward": 0.7380185723304749, "reward_std": 0.2839358498652776, "rewards/reward_fn/mean": 0.7380185723304749, "rewards/reward_fn/std": 0.2839358349641164, "sampling/importance_sampling_ratio/max": 1.2804924249649048, "sampling/importance_sampling_ratio/mean": 0.3918954332669576, "sampling/importance_sampling_ratio/min": 0.00035972810019302415, "sampling/sampling_logp_difference/max": 1.8124979734420776, "sampling/sampling_logp_difference/mean": 0.005071153864264488, "step": 5570, "step_time": 10.073755049426108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1525.5, "completions/mean_length": 1206.140625, "completions/mean_terminated_length": 566.0055236816406, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.02515082899481058, "epoch": 0.6706730769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.008007106371223927, "learning_rate": 3.294471153846154e-07, "loss": 0.0045, "num_tokens": 115428610.0, "reward": 0.7119818031787872, "reward_std": 0.2624135836958885, "rewards/reward_fn/mean": 0.7119818031787872, "rewards/reward_fn/std": 0.2624135762453079, "sampling/importance_sampling_ratio/max": 1.7582178115844727, "sampling/importance_sampling_ratio/mean": 0.3302469998598099, "sampling/importance_sampling_ratio/min": 0.00027984995540464297, "sampling/sampling_logp_difference/max": 2.706644654273987, "sampling/sampling_logp_difference/mean": 0.005634930916130543, "step": 5580, "step_time": 7.180367001052946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.11458333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1968.6666666666667, "completions/mean_length": 805.7083333333334, "completions/mean_terminated_length": 522.4891764322916, "completions/min_length": 131.33333333333334, "completions/min_terminated_length": 131.33333333333334, "entropy": 0.02154187113046646, "epoch": 0.671875, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.022004179656505585, "learning_rate": 3.282451923076923e-07, "loss": -0.0031, "num_tokens": 115616822.0, "reward": 0.6418007612228394, "reward_std": 0.3810599346955617, "rewards/reward_fn/mean": 0.6418007612228394, "rewards/reward_fn/std": 0.3810599346955617, "sampling/importance_sampling_ratio/max": 1.3404910167058308, "sampling/importance_sampling_ratio/mean": 0.4898800849914551, "sampling/importance_sampling_ratio/min": 0.0008518040698011949, "sampling/sampling_logp_difference/max": 1.655112346013387, "sampling/sampling_logp_difference/mean": 0.004879883490502834, "step": 5590, "step_time": 10.065056884847582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1302.0, "completions/mean_length": 827.5, "completions/mean_terminated_length": 323.94073486328125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.02074054330587387, "epoch": 0.6730769230769231, "frac_reward_zero_std": 0.25, "grad_norm": 0.018275950103998184, "learning_rate": 3.270432692307692e-07, "loss": -0.0096, "num_tokens": 115742838.0, "reward": 0.6091831922531128, "reward_std": 0.3790973275899887, "rewards/reward_fn/mean": 0.6091831922531128, "rewards/reward_fn/std": 0.3790973126888275, "sampling/importance_sampling_ratio/max": 1.7012999057769775, "sampling/importance_sampling_ratio/mean": 0.5621017813682556, "sampling/importance_sampling_ratio/min": 4.956829116053996e-06, "sampling/sampling_logp_difference/max": 1.3375742435455322, "sampling/sampling_logp_difference/mean": 0.0042484470177441835, "step": 5600, "step_time": 7.354970640782267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19791666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 1478.6666666666667, "completions/mean_length": 932.6145833333334, "completions/mean_terminated_length": 407.53468322753906, "completions/min_length": 104.33333333333333, "completions/min_terminated_length": 104.33333333333333, "entropy": 0.02253870889544487, "epoch": 0.6742788461538461, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.012986783869564533, "learning_rate": 3.2584134615384614e-07, "loss": -0.0158, "num_tokens": 115935553.0, "reward": 0.6604377925395966, "reward_std": 0.271195724606514, "rewards/reward_fn/mean": 0.6604377925395966, "rewards/reward_fn/std": 0.2711957295735677, "sampling/importance_sampling_ratio/max": 1.3425262769063313, "sampling/importance_sampling_ratio/mean": 0.45382200678189594, "sampling/importance_sampling_ratio/min": 1.8415154045214877e-05, "sampling/sampling_logp_difference/max": 2.1413347721099854, "sampling/sampling_logp_difference/mean": 0.005164275877177715, "step": 5610, "step_time": 10.291837770584971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2238.0, "completions/mean_length": 769.734375, "completions/mean_terminated_length": 451.1250305175781, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.025818794406950472, "epoch": 0.6754807692307693, "frac_reward_zero_std": 0.0, "grad_norm": 0.021114585921168327, "learning_rate": 3.2463942307692303e-07, "loss": 0.0038, "num_tokens": 116041688.0, "reward": 0.8093239068984985, "reward_std": 0.2110903337597847, "rewards/reward_fn/mean": 0.8093239068984985, "rewards/reward_fn/std": 0.2110903486609459, "sampling/importance_sampling_ratio/max": 1.6460285782814026, "sampling/importance_sampling_ratio/mean": 0.5203056335449219, "sampling/importance_sampling_ratio/min": 0.00247401489650656, "sampling/sampling_logp_difference/max": 2.2657148241996765, "sampling/sampling_logp_difference/mean": 0.006460878532379866, "step": 5620, "step_time": 6.717339223995805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2604166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2337.0, "completions/mean_length": 1229.03125, "completions/mean_terminated_length": 601.910146077474, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.02417685016989708, "epoch": 0.6766826923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.010459398850798607, "learning_rate": 3.234375e-07, "loss": -0.0075, "num_tokens": 116255835.0, "reward": 0.7591183384259542, "reward_std": 0.26305092374483746, "rewards/reward_fn/mean": 0.7591183384259542, "rewards/reward_fn/std": 0.26305093864599866, "sampling/importance_sampling_ratio/max": 1.7733415365219116, "sampling/importance_sampling_ratio/mean": 0.3838611940542857, "sampling/importance_sampling_ratio/min": 3.0592050582830176e-05, "sampling/sampling_logp_difference/max": 3.648642142613729, "sampling/sampling_logp_difference/mean": 0.0053887067673106985, "step": 5630, "step_time": 10.296051324158906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 2231.5, "completions/mean_length": 1245.65625, "completions/mean_terminated_length": 671.0874328613281, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.023659651353955268, "epoch": 0.6778846153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.005177112761884928, "learning_rate": 3.222355769230769e-07, "loss": -0.0039, "num_tokens": 116391445.0, "reward": 0.7577330768108368, "reward_std": 0.22974466532468796, "rewards/reward_fn/mean": 0.7577330768108368, "rewards/reward_fn/std": 0.22974465787410736, "sampling/importance_sampling_ratio/max": 1.6517431735992432, "sampling/importance_sampling_ratio/mean": 0.31942807137966156, "sampling/importance_sampling_ratio/min": 7.409416048176354e-05, "sampling/sampling_logp_difference/max": 2.459957242012024, "sampling/sampling_logp_difference/mean": 0.0053722800221294165, "step": 5640, "step_time": 7.111894609779119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2708333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2545.6666666666665, "completions/mean_length": 1235.4895833333333, "completions/mean_terminated_length": 579.5358072916666, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.023530502803623676, "epoch": 0.6790865384615384, "frac_reward_zero_std": 0.0, "grad_norm": 0.011476626619696617, "learning_rate": 3.210336538461538e-07, "loss": 0.0115, "num_tokens": 116629228.0, "reward": 0.7366732358932495, "reward_std": 0.2557235211133957, "rewards/reward_fn/mean": 0.7366732358932495, "rewards/reward_fn/std": 0.2557235012451808, "sampling/importance_sampling_ratio/max": 1.6914750337600708, "sampling/importance_sampling_ratio/mean": 0.394760141770045, "sampling/importance_sampling_ratio/min": 1.9781464961473223e-05, "sampling/sampling_logp_difference/max": 1.6486591498057048, "sampling/sampling_logp_difference/mean": 0.005177922857304414, "step": 5650, "step_time": 10.629603874962777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2294.0, "completions/mean_length": 1008.15625, "completions/mean_terminated_length": 560.4017944335938, "completions/min_length": 101.5, "completions/min_terminated_length": 101.5, "entropy": 0.02417966928333044, "epoch": 0.6802884615384616, "frac_reward_zero_std": 0.0, "grad_norm": 0.02016427367925644, "learning_rate": 3.1983173076923075e-07, "loss": 0.0031, "num_tokens": 116791270.0, "reward": 0.6637646555900574, "reward_std": 0.28437813371419907, "rewards/reward_fn/mean": 0.6637646555900574, "rewards/reward_fn/std": 0.2843781188130379, "sampling/importance_sampling_ratio/max": 1.5108408331871033, "sampling/importance_sampling_ratio/mean": 0.39669956266880035, "sampling/importance_sampling_ratio/min": 0.00013670455518877134, "sampling/sampling_logp_difference/max": 1.3575817346572876, "sampling/sampling_logp_difference/mean": 0.005580377299338579, "step": 5660, "step_time": 7.257885010447353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3020833333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2384.6666666666665, "completions/mean_length": 1316.28125, "completions/mean_terminated_length": 595.8715209960938, "completions/min_length": 123.66666666666667, "completions/min_terminated_length": 123.66666666666667, "entropy": 0.023232377879321576, "epoch": 0.6814903846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.013450209982693195, "learning_rate": 3.186298076923077e-07, "loss": -0.0153, "num_tokens": 117036793.0, "reward": 0.7254046400388082, "reward_std": 0.2639298389355342, "rewards/reward_fn/mean": 0.7254046400388082, "rewards/reward_fn/std": 0.2639298488696416, "sampling/importance_sampling_ratio/max": 2.016422907511393, "sampling/importance_sampling_ratio/mean": 0.42224739988644916, "sampling/importance_sampling_ratio/min": 3.559321618240574e-05, "sampling/sampling_logp_difference/max": 2.448440909385681, "sampling/sampling_logp_difference/mean": 0.005325324833393097, "step": 5670, "step_time": 10.323535762447865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1579.5, "completions/mean_length": 1125.59375, "completions/mean_terminated_length": 451.20240783691406, "completions/min_length": 111.5, "completions/min_terminated_length": 111.5, "entropy": 0.025731902942061423, "epoch": 0.6826923076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.03205043822526932, "learning_rate": 3.1742788461538463e-07, "loss": -0.0035, "num_tokens": 117187839.0, "reward": 0.6438798904418945, "reward_std": 0.320801243185997, "rewards/reward_fn/mean": 0.6438798904418945, "rewards/reward_fn/std": 0.320801243185997, "sampling/importance_sampling_ratio/max": 1.129992127418518, "sampling/importance_sampling_ratio/mean": 0.3738640546798706, "sampling/importance_sampling_ratio/min": 0.00016063467046478763, "sampling/sampling_logp_difference/max": 1.9865968227386475, "sampling/sampling_logp_difference/mean": 0.005430861609056592, "step": 5680, "step_time": 7.164773730002343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23958333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2033.6666666666667, "completions/mean_length": 965.53125, "completions/mean_terminated_length": 315.48967997233075, "completions/min_length": 111.66666666666667, "completions/min_terminated_length": 111.66666666666667, "entropy": 0.021682826895266773, "epoch": 0.6838942307692307, "frac_reward_zero_std": 0.25, "grad_norm": 0.003306974656879902, "learning_rate": 3.162259615384615e-07, "loss": -0.0096, "num_tokens": 117380842.0, "reward": 0.7222851912180582, "reward_std": 0.2342422902584076, "rewards/reward_fn/mean": 0.7222851912180582, "rewards/reward_fn/std": 0.23424228529135385, "sampling/importance_sampling_ratio/max": 2.015679200490316, "sampling/importance_sampling_ratio/mean": 0.5426170527935028, "sampling/importance_sampling_ratio/min": 4.3232478806961204e-05, "sampling/sampling_logp_difference/max": 2.6522485415140786, "sampling/sampling_logp_difference/mean": 0.0047216531820595264, "step": 5690, "step_time": 10.232220260240137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2428.5, "completions/mean_length": 839.9375, "completions/mean_terminated_length": 531.3571624755859, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.02534839157015085, "epoch": 0.6850961538461539, "frac_reward_zero_std": 0.0, "grad_norm": 0.11629132181406021, "learning_rate": 3.1502403846153847e-07, "loss": -0.0009, "num_tokens": 117511710.0, "reward": 0.8146072924137115, "reward_std": 0.21860410273075104, "rewards/reward_fn/mean": 0.8146072924137115, "rewards/reward_fn/std": 0.21860408037900925, "sampling/importance_sampling_ratio/max": 1.9415854215621948, "sampling/importance_sampling_ratio/mean": 0.503687858581543, "sampling/importance_sampling_ratio/min": 2.00506051442062e-05, "sampling/sampling_logp_difference/max": 2.6400267481803894, "sampling/sampling_logp_difference/mean": 0.006134539609774947, "step": 5700, "step_time": 6.98703987095505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.11458333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2083.6666666666665, "completions/mean_length": 736.2916666666666, "completions/mean_terminated_length": 445.9274190266927, "completions/min_length": 137.33333333333334, "completions/min_terminated_length": 137.33333333333334, "entropy": 0.024577100574970246, "epoch": 0.6862980769230769, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.008050142787396908, "learning_rate": 3.138221153846154e-07, "loss": -0.0004, "num_tokens": 117704202.0, "reward": 0.6973151961962382, "reward_std": 0.33705391983191174, "rewards/reward_fn/mean": 0.6973151961962382, "rewards/reward_fn/std": 0.33705390989780426, "sampling/importance_sampling_ratio/max": 1.2427302201588948, "sampling/importance_sampling_ratio/mean": 0.48920251925786334, "sampling/importance_sampling_ratio/min": 6.421552628429102e-05, "sampling/sampling_logp_difference/max": 2.0293107827504477, "sampling/sampling_logp_difference/mean": 0.005485802888870239, "step": 5710, "step_time": 9.796541166305541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2259.5, "completions/mean_length": 1055.875, "completions/mean_terminated_length": 516.2662353515625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.02415943332016468, "epoch": 0.6875, "frac_reward_zero_std": 0.125, "grad_norm": 0.006039869040250778, "learning_rate": 3.126201923076923e-07, "loss": 0.0065, "num_tokens": 117844994.0, "reward": 0.6324527859687805, "reward_std": 0.33152899146080017, "rewards/reward_fn/mean": 0.6324527859687805, "rewards/reward_fn/std": 0.3315289616584778, "sampling/importance_sampling_ratio/max": 1.6640293598175049, "sampling/importance_sampling_ratio/mean": 0.40748655796051025, "sampling/importance_sampling_ratio/min": 0.00045452221365849255, "sampling/sampling_logp_difference/max": 2.4116860032081604, "sampling/sampling_logp_difference/mean": 0.004768000915646553, "step": 5720, "step_time": 6.998002458829433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1514.0, "completions/mean_length": 1114.9270833333333, "completions/mean_terminated_length": 378.2483317057292, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.023680008947849274, "epoch": 0.6887019230769231, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.013316715136170387, "learning_rate": 3.114182692307692e-07, "loss": -0.0065, "num_tokens": 118063771.0, "reward": 0.7322263717651367, "reward_std": 0.2446676641702652, "rewards/reward_fn/mean": 0.7322263717651367, "rewards/reward_fn/std": 0.244667649269104, "sampling/importance_sampling_ratio/max": 1.9526769320170085, "sampling/importance_sampling_ratio/mean": 0.4563148319721222, "sampling/importance_sampling_ratio/min": 0.00017549215317558264, "sampling/sampling_logp_difference/max": 3.7969661156336465, "sampling/sampling_logp_difference/mean": 0.005515054489175479, "step": 5730, "step_time": 10.132136875763535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2277.5, "completions/mean_length": 895.75, "completions/mean_terminated_length": 561.021240234375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.019334237091243267, "epoch": 0.6899038461538461, "frac_reward_zero_std": 0.25, "grad_norm": 0.01195046678185463, "learning_rate": 3.1021634615384613e-07, "loss": -0.0117, "num_tokens": 118179763.0, "reward": 0.5865126252174377, "reward_std": 0.3902212381362915, "rewards/reward_fn/mean": 0.5865126252174377, "rewards/reward_fn/std": 0.3902212083339691, "sampling/importance_sampling_ratio/max": 2.153870701789856, "sampling/importance_sampling_ratio/mean": 0.5238884389400482, "sampling/importance_sampling_ratio/min": 0.00061633382938453, "sampling/sampling_logp_difference/max": 2.9347407817840576, "sampling/sampling_logp_difference/mean": 0.004178905393928289, "step": 5740, "step_time": 7.014181983377784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 862.78125, "completions/mean_terminated_length": 455.6312561035156, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.020571821928024293, "epoch": 0.6911057692307693, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.029236502945423126, "learning_rate": 3.09014423076923e-07, "loss": -0.0036, "num_tokens": 118359318.0, "reward": 0.7789009213447571, "reward_std": 0.18005073567231497, "rewards/reward_fn/mean": 0.7789009213447571, "rewards/reward_fn/std": 0.18005073070526123, "sampling/importance_sampling_ratio/max": 1.4398918946584065, "sampling/importance_sampling_ratio/mean": 0.500187337398529, "sampling/importance_sampling_ratio/min": 0.00019011498079635203, "sampling/sampling_logp_difference/max": 1.484641671180725, "sampling/sampling_logp_difference/mean": 0.004896257848789294, "step": 5750, "step_time": 10.11867467975244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 1859.5, "completions/mean_length": 1134.65625, "completions/mean_terminated_length": 513.7724914550781, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.021610194630920888, "epoch": 0.6923076923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.003876987611874938, "learning_rate": 3.0781249999999996e-07, "loss": 0.0002, "num_tokens": 118495944.0, "reward": 0.6904861927032471, "reward_std": 0.29316461831331253, "rewards/reward_fn/mean": 0.6904861927032471, "rewards/reward_fn/std": 0.29316461831331253, "sampling/importance_sampling_ratio/max": 1.5873793959617615, "sampling/importance_sampling_ratio/mean": 0.39754898846149445, "sampling/importance_sampling_ratio/min": 0.00020208716705383267, "sampling/sampling_logp_difference/max": 1.7984607815742493, "sampling/sampling_logp_difference/mean": 0.0049202225636690855, "step": 5760, "step_time": 6.989177733939141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2708333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1691.0, "completions/mean_length": 1173.40625, "completions/mean_terminated_length": 505.77979532877606, "completions/min_length": 152.33333333333334, "completions/min_terminated_length": 152.33333333333334, "entropy": 0.021682186238467693, "epoch": 0.6935096153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.015431645326316357, "learning_rate": 3.066105769230769e-07, "loss": 0.027, "num_tokens": 118722559.0, "reward": 0.7634401321411133, "reward_std": 0.22755506138006845, "rewards/reward_fn/mean": 0.7634401321411133, "rewards/reward_fn/std": 0.22755506138006845, "sampling/importance_sampling_ratio/max": 1.7438308000564575, "sampling/importance_sampling_ratio/mean": 0.42804282903671265, "sampling/importance_sampling_ratio/min": 5.9401074395282194e-05, "sampling/sampling_logp_difference/max": 1.5495006243387859, "sampling/sampling_logp_difference/mean": 0.004533623422806461, "step": 5770, "step_time": 10.202187035605311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1940.0, "completions/mean_length": 1169.75, "completions/mean_terminated_length": 606.0384826660156, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.02368293683975935, "epoch": 0.6947115384615384, "frac_reward_zero_std": 0.0, "grad_norm": 0.015948517248034477, "learning_rate": 3.0540865384615385e-07, "loss": 0.0079, "num_tokens": 118866743.0, "reward": 0.7338148951530457, "reward_std": 0.27490830421447754, "rewards/reward_fn/mean": 0.7338148951530457, "rewards/reward_fn/std": 0.27490830421447754, "sampling/importance_sampling_ratio/max": 1.566188633441925, "sampling/importance_sampling_ratio/mean": 0.32032598555088043, "sampling/importance_sampling_ratio/min": 0.00020410778734003543, "sampling/sampling_logp_difference/max": 3.252277135848999, "sampling/sampling_logp_difference/mean": 0.0053365277126431465, "step": 5780, "step_time": 6.860391921550035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1873.3333333333333, "completions/mean_length": 873.53125, "completions/mean_terminated_length": 474.3965555826823, "completions/min_length": 142.66666666666666, "completions/min_terminated_length": 142.66666666666666, "entropy": 0.027155743166804313, "epoch": 0.6959134615384616, "frac_reward_zero_std": 0.0, "grad_norm": 0.01993592455983162, "learning_rate": 3.0420673076923074e-07, "loss": -0.0056, "num_tokens": 119053362.0, "reward": 0.7617470224698385, "reward_std": 0.2606724699338277, "rewards/reward_fn/mean": 0.7617470224698385, "rewards/reward_fn/std": 0.260672464966774, "sampling/importance_sampling_ratio/max": 1.7978062629699707, "sampling/importance_sampling_ratio/mean": 0.39628152052561444, "sampling/importance_sampling_ratio/min": 4.427303444269152e-05, "sampling/sampling_logp_difference/max": 1.839570164680481, "sampling/sampling_logp_difference/mean": 0.006124569724003474, "step": 5790, "step_time": 9.96362453615293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2381.0, "completions/mean_length": 1337.375, "completions/mean_terminated_length": 728.8296813964844, "completions/min_length": 132.5, "completions/min_terminated_length": 132.5, "entropy": 0.02504048589617014, "epoch": 0.6971153846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.008710440248250961, "learning_rate": 3.030048076923077e-07, "loss": 0.0107, "num_tokens": 119204906.0, "reward": 0.7364784181118011, "reward_std": 0.2519259452819824, "rewards/reward_fn/mean": 0.7364784181118011, "rewards/reward_fn/std": 0.25192593038082123, "sampling/importance_sampling_ratio/max": 1.422212839126587, "sampling/importance_sampling_ratio/mean": 0.2567962631583214, "sampling/importance_sampling_ratio/min": 2.467884371526452e-06, "sampling/sampling_logp_difference/max": 2.7017990350723267, "sampling/sampling_logp_difference/mean": 0.0057361233048141, "step": 5800, "step_time": 7.172270654141903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20833333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2346.3333333333335, "completions/mean_length": 1008.5416666666666, "completions/mean_terminated_length": 489.63401285807294, "completions/min_length": 118.33333333333333, "completions/min_terminated_length": 118.33333333333333, "entropy": 0.024551008082926273, "epoch": 0.6983173076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.032721955329179764, "learning_rate": 3.018028846153846e-07, "loss": -0.0019, "num_tokens": 119414126.0, "reward": 0.7886542081832886, "reward_std": 0.21032026410102844, "rewards/reward_fn/mean": 0.7886542081832886, "rewards/reward_fn/std": 0.21032026410102844, "sampling/importance_sampling_ratio/max": 1.3336173295974731, "sampling/importance_sampling_ratio/mean": 0.4133936365445455, "sampling/importance_sampling_ratio/min": 1.3093674018212672e-05, "sampling/sampling_logp_difference/max": 4.126853823661804, "sampling/sampling_logp_difference/mean": 0.005745795555412769, "step": 5810, "step_time": 10.190442400984466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1414.5, "completions/mean_length": 853.953125, "completions/mean_terminated_length": 389.38639068603516, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.022568091563880444, "epoch": 0.6995192307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.015489763580262661, "learning_rate": 3.006009615384615e-07, "loss": -0.0187, "num_tokens": 119531075.0, "reward": 0.7454442977905273, "reward_std": 0.24475381523370743, "rewards/reward_fn/mean": 0.7454442977905273, "rewards/reward_fn/std": 0.24475381523370743, "sampling/importance_sampling_ratio/max": 1.7336345314979553, "sampling/importance_sampling_ratio/mean": 0.5472727119922638, "sampling/importance_sampling_ratio/min": 0.00011961536802118644, "sampling/sampling_logp_difference/max": 1.168292224407196, "sampling/sampling_logp_difference/mean": 0.004835843108594418, "step": 5820, "step_time": 6.7777061986736955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2177.0, "completions/max_terminated_length": 1347.0, "completions/mean_length": 759.7604166666666, "completions/mean_terminated_length": 356.14055887858075, "completions/min_length": 126.66666666666667, "completions/min_terminated_length": 126.66666666666667, "entropy": 0.018538569286465644, "epoch": 0.7007211538461539, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.056635934859514236, "learning_rate": 2.9939903846153846e-07, "loss": -0.0024, "num_tokens": 119705676.0, "reward": 0.8144581516583761, "reward_std": 0.17705618838469186, "rewards/reward_fn/mean": 0.8144581516583761, "rewards/reward_fn/std": 0.17705620701114336, "sampling/importance_sampling_ratio/max": 1.170849084854126, "sampling/importance_sampling_ratio/mean": 0.4870804349581401, "sampling/importance_sampling_ratio/min": 0.0144733365088238, "sampling/sampling_logp_difference/max": 2.1879332065582275, "sampling/sampling_logp_difference/mean": 0.004687903448939323, "step": 5830, "step_time": 7.408830221090466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1943.5, "completions/mean_length": 1235.25, "completions/mean_terminated_length": 558.0381011962891, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.022869559377431868, "epoch": 0.7019230769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.022646617144346237, "learning_rate": 2.981971153846154e-07, "loss": 0.0045, "num_tokens": 119850828.0, "reward": 0.7647075057029724, "reward_std": 0.23689565807580948, "rewards/reward_fn/mean": 0.7647075057029724, "rewards/reward_fn/std": 0.23689565062522888, "sampling/importance_sampling_ratio/max": 1.3400092720985413, "sampling/importance_sampling_ratio/mean": 0.2944568246603012, "sampling/importance_sampling_ratio/min": 9.901939480982946e-05, "sampling/sampling_logp_difference/max": 2.8853038549423218, "sampling/sampling_logp_difference/mean": 0.0061653731390833855, "step": 5840, "step_time": 7.114811348356307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1886.6666666666667, "completions/mean_length": 854.3645833333334, "completions/mean_terminated_length": 370.4395802815755, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.026789778284728526, "epoch": 0.703125, "frac_reward_zero_std": 0.0, "grad_norm": 0.024331582710146904, "learning_rate": 2.9699519230769234e-07, "loss": -0.0004, "num_tokens": 120036951.0, "reward": 0.7888952294985453, "reward_std": 0.20567622284094492, "rewards/reward_fn/mean": 0.7888952294985453, "rewards/reward_fn/std": 0.2056762178738912, "sampling/importance_sampling_ratio/max": 2.1105496486028037, "sampling/importance_sampling_ratio/mean": 0.47533078988393146, "sampling/importance_sampling_ratio/min": 2.117532494594343e-05, "sampling/sampling_logp_difference/max": 1.4598786036173503, "sampling/sampling_logp_difference/mean": 0.0056447130627930164, "step": 5850, "step_time": 10.189264274202287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1795.5, "completions/mean_length": 1434.15625, "completions/mean_terminated_length": 521.0061340332031, "completions/min_length": 117.5, "completions/min_terminated_length": 117.5, "entropy": 0.021663465164601804, "epoch": 0.7043269230769231, "frac_reward_zero_std": 0.125, "grad_norm": 0.0018784079002216458, "learning_rate": 2.957932692307692e-07, "loss": -0.0017, "num_tokens": 120196993.0, "reward": 0.6508059203624725, "reward_std": 0.3156230077147484, "rewards/reward_fn/mean": 0.6508059203624725, "rewards/reward_fn/std": 0.3156230002641678, "sampling/importance_sampling_ratio/max": 1.6785475611686707, "sampling/importance_sampling_ratio/mean": 0.38526542484760284, "sampling/importance_sampling_ratio/min": 4.14877704315586e-05, "sampling/sampling_logp_difference/max": 2.9792429208755493, "sampling/sampling_logp_difference/mean": 0.004628902766853571, "step": 5860, "step_time": 7.080636318400503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1212.3333333333333, "completions/mean_length": 769.8645833333334, "completions/mean_terminated_length": 374.54933166503906, "completions/min_length": 141.66666666666666, "completions/min_terminated_length": 141.66666666666666, "entropy": 0.02036378225311637, "epoch": 0.7055288461538461, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.08740223199129105, "learning_rate": 2.945913461538461e-07, "loss": -0.0165, "num_tokens": 120381700.0, "reward": 0.6678885221481323, "reward_std": 0.34972549478212994, "rewards/reward_fn/mean": 0.6678885221481323, "rewards/reward_fn/std": 0.34972548484802246, "sampling/importance_sampling_ratio/max": 2.25439182917277, "sampling/importance_sampling_ratio/mean": 0.5425624251365662, "sampling/importance_sampling_ratio/min": 6.175821946878084e-05, "sampling/sampling_logp_difference/max": 3.678319573402405, "sampling/sampling_logp_difference/mean": 0.005121384747326374, "step": 5870, "step_time": 9.852112903725356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2229.0, "completions/mean_length": 1298.25, "completions/mean_terminated_length": 632.3478393554688, "completions/min_length": 122.5, "completions/min_terminated_length": 122.5, "entropy": 0.025258092768490316, "epoch": 0.7067307692307693, "frac_reward_zero_std": 0.0, "grad_norm": 0.006043012719601393, "learning_rate": 2.9338942307692307e-07, "loss": -0.0087, "num_tokens": 120538004.0, "reward": 0.7622282803058624, "reward_std": 0.21972207725048065, "rewards/reward_fn/mean": 0.7622282803058624, "rewards/reward_fn/std": 0.21972207725048065, "sampling/importance_sampling_ratio/max": 1.9052968621253967, "sampling/importance_sampling_ratio/mean": 0.3403141349554062, "sampling/importance_sampling_ratio/min": 1.0071486940432806e-05, "sampling/sampling_logp_difference/max": 1.7370696067810059, "sampling/sampling_logp_difference/mean": 0.0057585337199270725, "step": 5880, "step_time": 7.063450985681266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1226.0, "completions/mean_length": 616.78125, "completions/mean_terminated_length": 371.2890930175781, "completions/min_length": 132.66666666666666, "completions/min_terminated_length": 132.66666666666666, "entropy": 0.02668981608003378, "epoch": 0.7079326923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.015663154423236847, "learning_rate": 2.9218749999999996e-07, "loss": -0.0269, "num_tokens": 120704079.0, "reward": 0.7107234398523966, "reward_std": 0.25566130379835766, "rewards/reward_fn/mean": 0.7107234398523966, "rewards/reward_fn/std": 0.2556613087654114, "sampling/importance_sampling_ratio/max": 2.0441368420918784, "sampling/importance_sampling_ratio/mean": 0.5438246528307596, "sampling/importance_sampling_ratio/min": 0.0004161785063795757, "sampling/sampling_logp_difference/max": 1.6593326727549236, "sampling/sampling_logp_difference/mean": 0.005493022346248229, "step": 5890, "step_time": 9.910383229516446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2197.5, "completions/mean_length": 1177.9375, "completions/mean_terminated_length": 630.5058746337891, "completions/min_length": 156.5, "completions/min_terminated_length": 156.5, "entropy": 0.020860868878662585, "epoch": 0.7091346153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.007778140716254711, "learning_rate": 2.909855769230769e-07, "loss": -0.0023, "num_tokens": 120844563.0, "reward": 0.7124910354614258, "reward_std": 0.263979896903038, "rewards/reward_fn/mean": 0.7124910354614258, "rewards/reward_fn/std": 0.263979896903038, "sampling/importance_sampling_ratio/max": 1.549231231212616, "sampling/importance_sampling_ratio/mean": 0.42745283246040344, "sampling/importance_sampling_ratio/min": 1.444473241463129e-05, "sampling/sampling_logp_difference/max": 1.9192904233932495, "sampling/sampling_logp_difference/mean": 0.005799962440505624, "step": 5900, "step_time": 7.097004382684827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1947.6666666666667, "completions/mean_length": 1245.2291666666667, "completions/mean_terminated_length": 562.6989542643229, "completions/min_length": 141.66666666666666, "completions/min_terminated_length": 141.66666666666666, "entropy": 0.02191565092653036, "epoch": 0.7103365384615384, "frac_reward_zero_std": 0.0, "grad_norm": 0.018306855112314224, "learning_rate": 2.8978365384615384e-07, "loss": -0.0071, "num_tokens": 121096257.0, "reward": 0.6989591519037882, "reward_std": 0.25667667388916016, "rewards/reward_fn/mean": 0.6989591519037882, "rewards/reward_fn/std": 0.25667666892210644, "sampling/importance_sampling_ratio/max": 1.7631344000498455, "sampling/importance_sampling_ratio/mean": 0.3741202652454376, "sampling/importance_sampling_ratio/min": 6.793175180064281e-05, "sampling/sampling_logp_difference/max": 2.2543716430664062, "sampling/sampling_logp_difference/mean": 0.005148558101306359, "step": 5910, "step_time": 10.487626797333359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1410.0, "completions/mean_length": 746.9375, "completions/mean_terminated_length": 384.4104919433594, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.020957145281136035, "epoch": 0.7115384615384616, "frac_reward_zero_std": 0.125, "grad_norm": 0.016643628478050232, "learning_rate": 2.885817307692308e-07, "loss": 0.0039, "num_tokens": 121204917.0, "reward": 0.6874813437461853, "reward_std": 0.34093253314495087, "rewards/reward_fn/mean": 0.6874813437461853, "rewards/reward_fn/std": 0.34093254804611206, "sampling/importance_sampling_ratio/max": 1.272317349910736, "sampling/importance_sampling_ratio/mean": 0.49306556582450867, "sampling/importance_sampling_ratio/min": 4.421926405484555e-06, "sampling/sampling_logp_difference/max": 4.61320161819458, "sampling/sampling_logp_difference/mean": 0.004401404410600662, "step": 5920, "step_time": 6.762449546158313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2604166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1674.6666666666667, "completions/mean_length": 1132.3854166666667, "completions/mean_terminated_length": 504.8073018391927, "completions/min_length": 119.33333333333333, "completions/min_terminated_length": 119.33333333333333, "entropy": 0.023671055678278208, "epoch": 0.7127403846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.0038852563593536615, "learning_rate": 2.8737980769230767e-07, "loss": -0.0002, "num_tokens": 121432738.0, "reward": 0.6484651366869608, "reward_std": 0.32071038087209064, "rewards/reward_fn/mean": 0.6484651366869608, "rewards/reward_fn/std": 0.32071038087209064, "sampling/importance_sampling_ratio/max": 1.3492192029953003, "sampling/importance_sampling_ratio/mean": 0.39349159598350525, "sampling/importance_sampling_ratio/min": 4.484308302081056e-05, "sampling/sampling_logp_difference/max": 4.9673943519592285, "sampling/sampling_logp_difference/mean": 0.005181093234568834, "step": 5930, "step_time": 10.247431121766567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2364.5, "completions/mean_length": 1174.453125, "completions/mean_terminated_length": 711.147705078125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.021362774632871152, "epoch": 0.7139423076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.004459871910512447, "learning_rate": 2.861778846153846e-07, "loss": -0.0052, "num_tokens": 121573551.0, "reward": 0.767259031534195, "reward_std": 0.22714998573064804, "rewards/reward_fn/mean": 0.767259031534195, "rewards/reward_fn/std": 0.22714998573064804, "sampling/importance_sampling_ratio/max": 1.652427852153778, "sampling/importance_sampling_ratio/mean": 0.37447531521320343, "sampling/importance_sampling_ratio/min": 1.4747050727237365e-05, "sampling/sampling_logp_difference/max": 2.5360831022262573, "sampling/sampling_logp_difference/mean": 0.005057352129369974, "step": 5940, "step_time": 7.164101294707507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19791666666666666, "completions/max_length": 2299.0, "completions/max_terminated_length": 2089.6666666666665, "completions/mean_length": 968.5104166666666, "completions/mean_terminated_length": 491.98375447591144, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.02143734395503998, "epoch": 0.7151442307692307, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.0360000915825367, "learning_rate": 2.8497596153846156e-07, "loss": -0.0027, "num_tokens": 121768688.0, "reward": 0.7111457188924154, "reward_std": 0.281184454758962, "rewards/reward_fn/mean": 0.7111457188924154, "rewards/reward_fn/std": 0.28118446966012317, "sampling/importance_sampling_ratio/max": 1.785219430923462, "sampling/importance_sampling_ratio/mean": 0.5183273156483968, "sampling/importance_sampling_ratio/min": 0.021412741004799802, "sampling/sampling_logp_difference/max": 1.6454602479934692, "sampling/sampling_logp_difference/mean": 0.005312304167697827, "step": 5950, "step_time": 7.873716903850436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1515.5, "completions/mean_length": 1328.90625, "completions/mean_terminated_length": 579.745361328125, "completions/min_length": 142.5, "completions/min_terminated_length": 142.5, "entropy": 0.02305910289287567, "epoch": 0.7163461538461539, "frac_reward_zero_std": 0.125, "grad_norm": 0.009603442624211311, "learning_rate": 2.8377403846153845e-07, "loss": -0.0015, "num_tokens": 121909226.0, "reward": 0.7345192432403564, "reward_std": 0.24273282289505005, "rewards/reward_fn/mean": 0.7345192432403564, "rewards/reward_fn/std": 0.24273283034563065, "sampling/importance_sampling_ratio/max": 1.2092341184616089, "sampling/importance_sampling_ratio/mean": 0.4265051782131195, "sampling/importance_sampling_ratio/min": 0.0004896382306469604, "sampling/sampling_logp_difference/max": 2.8953969478607178, "sampling/sampling_logp_difference/mean": 0.005073888227343559, "step": 5960, "step_time": 7.018742381595075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.14583333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2332.3333333333335, "completions/mean_length": 934.3125, "completions/mean_terminated_length": 590.0348205566406, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.024967914260923863, "epoch": 0.7175480769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.0052222320809960365, "learning_rate": 2.825721153846154e-07, "loss": 0.0056, "num_tokens": 122101064.0, "reward": 0.7785837650299072, "reward_std": 0.23508720099925995, "rewards/reward_fn/mean": 0.7785837650299072, "rewards/reward_fn/std": 0.2350871960322062, "sampling/importance_sampling_ratio/max": 1.1952960689862568, "sampling/importance_sampling_ratio/mean": 0.36806486050287884, "sampling/importance_sampling_ratio/min": 1.884708823733187e-05, "sampling/sampling_logp_difference/max": 2.1636796792348227, "sampling/sampling_logp_difference/mean": 0.005518451953927676, "step": 5970, "step_time": 10.10549679948017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1701.0, "completions/mean_length": 1110.71875, "completions/mean_terminated_length": 434.1268615722656, "completions/min_length": 151.5, "completions/min_terminated_length": 151.5, "entropy": 0.026529455184936525, "epoch": 0.71875, "frac_reward_zero_std": 0.0, "grad_norm": 0.005157829727977514, "learning_rate": 2.8137019230769233e-07, "loss": 0.0116, "num_tokens": 122235542.0, "reward": 0.7453947365283966, "reward_std": 0.2336622253060341, "rewards/reward_fn/mean": 0.7453947365283966, "rewards/reward_fn/std": 0.2336622104048729, "sampling/importance_sampling_ratio/max": 1.8808993101119995, "sampling/importance_sampling_ratio/mean": 0.4065973460674286, "sampling/importance_sampling_ratio/min": 8.241177783929743e-05, "sampling/sampling_logp_difference/max": 2.4013842940330505, "sampling/sampling_logp_difference/mean": 0.0055973154958337545, "step": 5980, "step_time": 6.824440079927444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.22916666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2329.3333333333335, "completions/mean_length": 1158.875, "completions/mean_terminated_length": 632.1304524739584, "completions/min_length": 124.33333333333333, "completions/min_terminated_length": 124.33333333333333, "entropy": 0.02571730799973011, "epoch": 0.7199519230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.019024064764380455, "learning_rate": 2.8016826923076917e-07, "loss": -0.0042, "num_tokens": 122442170.0, "reward": 0.72914719581604, "reward_std": 0.2507435530424118, "rewards/reward_fn/mean": 0.72914719581604, "rewards/reward_fn/std": 0.2507435331741969, "sampling/importance_sampling_ratio/max": 1.9370033343633015, "sampling/importance_sampling_ratio/mean": 0.39417048295338947, "sampling/importance_sampling_ratio/min": 0.0005204965515683094, "sampling/sampling_logp_difference/max": 1.3500739733378093, "sampling/sampling_logp_difference/mean": 0.005252715510626634, "step": 5990, "step_time": 10.275085000973196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1630.0, "completions/mean_length": 696.90625, "completions/mean_terminated_length": 316.3640670776367, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.023802987486124038, "epoch": 0.7211538461538461, "frac_reward_zero_std": 0.0, "grad_norm": 0.01789058744907379, "learning_rate": 2.789663461538461e-07, "loss": -0.0128, "num_tokens": 122564116.0, "reward": 0.8255707025527954, "reward_std": 0.17041092365980148, "rewards/reward_fn/mean": 0.8255707025527954, "rewards/reward_fn/std": 0.1704109162092209, "sampling/importance_sampling_ratio/max": 1.6328119039535522, "sampling/importance_sampling_ratio/mean": 0.6372896730899811, "sampling/importance_sampling_ratio/min": 4.5828930524294265e-05, "sampling/sampling_logp_difference/max": 1.8959705829620361, "sampling/sampling_logp_difference/mean": 0.005278675118461251, "step": 6000, "step_time": 6.940105112548918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3229166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2399.3333333333335, "completions/mean_length": 1311.1875, "completions/mean_terminated_length": 523.44970703125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.021815344877541065, "epoch": 0.7223557692307693, "frac_reward_zero_std": 0.0, "grad_norm": 0.012110264971852303, "learning_rate": 2.7776442307692306e-07, "loss": -0.0084, "num_tokens": 122800462.0, "reward": 0.6604705452919006, "reward_std": 0.29904887576897937, "rewards/reward_fn/mean": 0.6604705452919006, "rewards/reward_fn/std": 0.29904886583487195, "sampling/importance_sampling_ratio/max": 1.5241968631744385, "sampling/importance_sampling_ratio/mean": 0.39984099566936493, "sampling/importance_sampling_ratio/min": 0.00020576672007640204, "sampling/sampling_logp_difference/max": 4.045389016469319, "sampling/sampling_logp_difference/mean": 0.005187759796778361, "step": 6010, "step_time": 10.181650455109775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2110.5, "completions/mean_length": 1034.296875, "completions/mean_terminated_length": 438.56060791015625, "completions/min_length": 120.5, "completions/min_terminated_length": 120.5, "entropy": 0.024363514967262745, "epoch": 0.7235576923076923, "frac_reward_zero_std": 0.125, "grad_norm": 0.0038336487486958504, "learning_rate": 2.765625e-07, "loss": -0.01, "num_tokens": 122948665.0, "reward": 0.7594738900661469, "reward_std": 0.2226114422082901, "rewards/reward_fn/mean": 0.7594738900661469, "rewards/reward_fn/std": 0.2226114422082901, "sampling/importance_sampling_ratio/max": 1.94916433095932, "sampling/importance_sampling_ratio/mean": 0.3930211365222931, "sampling/importance_sampling_ratio/min": 4.9697976692186785e-05, "sampling/sampling_logp_difference/max": 1.7916730046272278, "sampling/sampling_logp_difference/mean": 0.005251530557870865, "step": 6020, "step_time": 6.876651938259601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2708333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2211.0, "completions/mean_length": 1268.90625, "completions/mean_terminated_length": 638.5635070800781, "completions/min_length": 141.66666666666666, "completions/min_terminated_length": 141.66666666666666, "entropy": 0.02436906099319458, "epoch": 0.7247596153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.024060731753706932, "learning_rate": 2.753605769230769e-07, "loss": 0.0097, "num_tokens": 123227296.0, "reward": 0.718111534913381, "reward_std": 0.2795058687527974, "rewards/reward_fn/mean": 0.718111534913381, "rewards/reward_fn/std": 0.2795058786869049, "sampling/importance_sampling_ratio/max": 1.5164294242858887, "sampling/importance_sampling_ratio/mean": 0.3047575503587723, "sampling/importance_sampling_ratio/min": 1.72150058688203e-05, "sampling/sampling_logp_difference/max": 5.177751620610555, "sampling/sampling_logp_difference/mean": 0.005868974141776562, "step": 6030, "step_time": 10.686313719768076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1806.0, "completions/mean_length": 864.296875, "completions/mean_terminated_length": 416.8442840576172, "completions/min_length": 121.5, "completions/min_terminated_length": 121.5, "entropy": 0.02792499344795942, "epoch": 0.7259615384615384, "frac_reward_zero_std": 0.0, "grad_norm": 0.014189459383487701, "learning_rate": 2.7415865384615383e-07, "loss": -0.008, "num_tokens": 123353571.0, "reward": 0.7499031722545624, "reward_std": 0.24117495119571686, "rewards/reward_fn/mean": 0.7499031722545624, "rewards/reward_fn/std": 0.24117494374513626, "sampling/importance_sampling_ratio/max": 1.5250192284584045, "sampling/importance_sampling_ratio/mean": 0.3622605949640274, "sampling/importance_sampling_ratio/min": 0.0003256580785091501, "sampling/sampling_logp_difference/max": 1.7174392938613892, "sampling/sampling_logp_difference/mean": 0.006534783868119121, "step": 6040, "step_time": 6.883140973281115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13541666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 1501.0, "completions/mean_length": 808.1979166666666, "completions/mean_terminated_length": 477.759760538737, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.025251895003020764, "epoch": 0.7271634615384616, "frac_reward_zero_std": 0.0, "grad_norm": 0.043305493891239166, "learning_rate": 2.729567307692308e-07, "loss": -0.0095, "num_tokens": 123550894.0, "reward": 0.7739616831143697, "reward_std": 0.2569871296485265, "rewards/reward_fn/mean": 0.7739616831143697, "rewards/reward_fn/std": 0.2569871296485265, "sampling/importance_sampling_ratio/max": 1.6057082414627075, "sampling/importance_sampling_ratio/mean": 0.44391919175783795, "sampling/importance_sampling_ratio/min": 7.68661725869535e-06, "sampling/sampling_logp_difference/max": 4.53795079390208, "sampling/sampling_logp_difference/mean": 0.005844858319809039, "step": 6050, "step_time": 9.95870946822688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2153.0, "completions/mean_length": 1137.3125, "completions/mean_terminated_length": 617.2711181640625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.024666242860257627, "epoch": 0.7283653846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.012890190817415714, "learning_rate": 2.7175480769230766e-07, "loss": -0.0023, "num_tokens": 123686338.0, "reward": 0.7819849848747253, "reward_std": 0.22673888504505157, "rewards/reward_fn/mean": 0.7819849848747253, "rewards/reward_fn/std": 0.22673886269330978, "sampling/importance_sampling_ratio/max": 1.6959571838378906, "sampling/importance_sampling_ratio/mean": 0.3469923138618469, "sampling/importance_sampling_ratio/min": 3.6242956412024796e-05, "sampling/sampling_logp_difference/max": 1.2375773787498474, "sampling/sampling_logp_difference/mean": 0.004976677475497127, "step": 6060, "step_time": 7.024133733753115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23958333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1839.3333333333333, "completions/mean_length": 995.8645833333334, "completions/mean_terminated_length": 362.36231486002606, "completions/min_length": 95.66666666666667, "completions/min_terminated_length": 95.66666666666667, "entropy": 0.023170528747141362, "epoch": 0.7295673076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.013697778806090355, "learning_rate": 2.705528846153846e-07, "loss": -0.0116, "num_tokens": 123882301.0, "reward": 0.7793773810068766, "reward_std": 0.21857539812723795, "rewards/reward_fn/mean": 0.7793773810068766, "rewards/reward_fn/std": 0.2185753881931305, "sampling/importance_sampling_ratio/max": 1.5266101757685344, "sampling/importance_sampling_ratio/mean": 0.49963430563608807, "sampling/importance_sampling_ratio/min": 2.8950697924301494e-05, "sampling/sampling_logp_difference/max": 3.5882078806559243, "sampling/sampling_logp_difference/mean": 0.0055504545258979006, "step": 6070, "step_time": 10.302473663724959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2570.0, "completions/mean_length": 1794.890625, "completions/mean_terminated_length": 796.6192932128906, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.024718627519905566, "epoch": 0.7307692307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.005880368407815695, "learning_rate": 2.6935096153846155e-07, "loss": -0.0045, "num_tokens": 124087462.0, "reward": 0.6470432579517365, "reward_std": 0.2651350349187851, "rewards/reward_fn/mean": 0.6470432579517365, "rewards/reward_fn/std": 0.2651350349187851, "sampling/importance_sampling_ratio/max": 1.2462676763534546, "sampling/importance_sampling_ratio/mean": 0.19697962701320648, "sampling/importance_sampling_ratio/min": 5.469982852446265e-06, "sampling/sampling_logp_difference/max": 2.675572156906128, "sampling/sampling_logp_difference/mean": 0.0053839769680053, "step": 6080, "step_time": 7.340327640809119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20833333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1952.0, "completions/mean_length": 1065.5104166666667, "completions/mean_terminated_length": 540.8132731119791, "completions/min_length": 126.33333333333333, "completions/min_terminated_length": 126.33333333333333, "entropy": 0.021974716708064078, "epoch": 0.7319711538461539, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.011942893266677856, "learning_rate": 2.681490384615385e-07, "loss": 0.0165, "num_tokens": 124298175.0, "reward": 0.6956712007522583, "reward_std": 0.28730973104635876, "rewards/reward_fn/mean": 0.6956712007522583, "rewards/reward_fn/std": 0.2873097211122513, "sampling/importance_sampling_ratio/max": 1.6242293914159138, "sampling/importance_sampling_ratio/mean": 0.39623693625132245, "sampling/importance_sampling_ratio/min": 5.041100272743885e-05, "sampling/sampling_logp_difference/max": 2.7552411953608194, "sampling/sampling_logp_difference/mean": 0.004555399219195048, "step": 6090, "step_time": 10.01151556642726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2743.0, "completions/mean_length": 935.296875, "completions/mean_terminated_length": 552.9444427490234, "completions/min_length": 135.5, "completions/min_terminated_length": 135.5, "entropy": 0.025215313769876957, "epoch": 0.7331730769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.0065670982003211975, "learning_rate": 2.669471153846154e-07, "loss": 0.0005, "num_tokens": 124428818.0, "reward": 0.7034922242164612, "reward_std": 0.2880922108888626, "rewards/reward_fn/mean": 0.7034922242164612, "rewards/reward_fn/std": 0.288092203438282, "sampling/importance_sampling_ratio/max": 1.817260056734085, "sampling/importance_sampling_ratio/mean": 0.3942994624376297, "sampling/importance_sampling_ratio/min": 2.3441678422386758e-05, "sampling/sampling_logp_difference/max": 1.620371401309967, "sampling/sampling_logp_difference/mean": 0.005924354773014784, "step": 6100, "step_time": 7.237138087395579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19791666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2326.6666666666665, "completions/mean_length": 1023.2604166666666, "completions/mean_terminated_length": 538.9712829589844, "completions/min_length": 146.66666666666666, "completions/min_terminated_length": 146.66666666666666, "entropy": 0.02194778695702553, "epoch": 0.734375, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.005345981102436781, "learning_rate": 2.657451923076923e-07, "loss": 0.0142, "num_tokens": 124627867.0, "reward": 0.7147150635719299, "reward_std": 0.29200051724910736, "rewards/reward_fn/mean": 0.7147150635719299, "rewards/reward_fn/std": 0.29200052718321484, "sampling/importance_sampling_ratio/max": 1.3430152138074238, "sampling/importance_sampling_ratio/mean": 0.4142563094695409, "sampling/importance_sampling_ratio/min": 0.014269497604497397, "sampling/sampling_logp_difference/max": 1.6011621554692586, "sampling/sampling_logp_difference/mean": 0.0045472080043206615, "step": 6110, "step_time": 10.143693393841385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1638.0, "completions/mean_length": 1299.90625, "completions/mean_terminated_length": 458.61058044433594, "completions/min_length": 148.5, "completions/min_terminated_length": 148.5, "entropy": 0.019987412542104722, "epoch": 0.7355769230769231, "frac_reward_zero_std": 0.125, "grad_norm": 0.0030258495826274157, "learning_rate": 2.645432692307692e-07, "loss": 0.0037, "num_tokens": 124787037.0, "reward": 0.6012287735939026, "reward_std": 0.3725048005580902, "rewards/reward_fn/mean": 0.6012287735939026, "rewards/reward_fn/std": 0.3725048005580902, "sampling/importance_sampling_ratio/max": 1.0320052802562714, "sampling/importance_sampling_ratio/mean": 0.39952678233385086, "sampling/importance_sampling_ratio/min": 0.0002784519056149293, "sampling/sampling_logp_difference/max": 2.1242016553878784, "sampling/sampling_logp_difference/mean": 0.004295114427804947, "step": 6120, "step_time": 7.288082194980234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.14583333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1779.0, "completions/mean_length": 909.8958333333334, "completions/mean_terminated_length": 550.7949727376302, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.027668667770922185, "epoch": 0.7367788461538461, "frac_reward_zero_std": 0.0, "grad_norm": 0.03417080640792847, "learning_rate": 2.633413461538461e-07, "loss": -0.0098, "num_tokens": 124980291.0, "reward": 0.8055042028427124, "reward_std": 0.21279531717300415, "rewards/reward_fn/mean": 0.8055042028427124, "rewards/reward_fn/std": 0.21279530227184296, "sampling/importance_sampling_ratio/max": 1.384001652399699, "sampling/importance_sampling_ratio/mean": 0.414599488178889, "sampling/importance_sampling_ratio/min": 1.8265356933018968e-05, "sampling/sampling_logp_difference/max": 1.3486584424972534, "sampling/sampling_logp_difference/mean": 0.006097461407383283, "step": 6130, "step_time": 10.275199847389013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1630.0, "completions/mean_length": 1493.328125, "completions/mean_terminated_length": 704.1190795898438, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.02344862837344408, "epoch": 0.7379807692307693, "frac_reward_zero_std": 0.0, "grad_norm": 0.01721375808119774, "learning_rate": 2.6213942307692305e-07, "loss": -0.0027, "num_tokens": 125156024.0, "reward": 0.689859002828598, "reward_std": 0.2694249823689461, "rewards/reward_fn/mean": 0.689859002828598, "rewards/reward_fn/std": 0.2694249749183655, "sampling/importance_sampling_ratio/max": 1.5999773144721985, "sampling/importance_sampling_ratio/mean": 0.22401005029678345, "sampling/importance_sampling_ratio/min": 2.7282299015496392e-05, "sampling/sampling_logp_difference/max": 3.6406957507133484, "sampling/sampling_logp_difference/mean": 0.005549831781536341, "step": 6140, "step_time": 7.0424317584373055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.22916666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 1064.8541666666667, "completions/mean_terminated_length": 493.72430419921875, "completions/min_length": 128.33333333333334, "completions/min_terminated_length": 128.33333333333334, "entropy": 0.026604149863123892, "epoch": 0.7391826923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.004645763896405697, "learning_rate": 2.609375e-07, "loss": -0.0072, "num_tokens": 125370146.0, "reward": 0.7465298175811768, "reward_std": 0.2661922872066498, "rewards/reward_fn/mean": 0.7465298175811768, "rewards/reward_fn/std": 0.2661922772725423, "sampling/importance_sampling_ratio/max": 2.208806316057841, "sampling/importance_sampling_ratio/mean": 0.42961488167444867, "sampling/importance_sampling_ratio/min": 3.928758390732886e-05, "sampling/sampling_logp_difference/max": 1.6852229436238606, "sampling/sampling_logp_difference/mean": 0.005402886463950078, "step": 6150, "step_time": 10.007628090959042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1283.0, "completions/mean_length": 761.515625, "completions/mean_terminated_length": 346.9814758300781, "completions/min_length": 98.5, "completions/min_terminated_length": 98.5, "entropy": 0.026571783050894738, "epoch": 0.7403846153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.01103800255805254, "learning_rate": 2.597355769230769e-07, "loss": -0.0098, "num_tokens": 125506995.0, "reward": 0.8128134608268738, "reward_std": 0.20934384316205978, "rewards/reward_fn/mean": 0.8128134608268738, "rewards/reward_fn/std": 0.2093438357114792, "sampling/importance_sampling_ratio/max": 1.5089675188064575, "sampling/importance_sampling_ratio/mean": 0.4826699048280716, "sampling/importance_sampling_ratio/min": 0.0009016062540467829, "sampling/sampling_logp_difference/max": 2.5069574117660522, "sampling/sampling_logp_difference/mean": 0.005709479562938213, "step": 6160, "step_time": 7.012219895701856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1888.6666666666667, "completions/mean_length": 918.5833333333334, "completions/mean_terminated_length": 541.5695292154948, "completions/min_length": 141.33333333333334, "completions/min_terminated_length": 141.33333333333334, "entropy": 0.024767768755555152, "epoch": 0.7415865384615384, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.004027215763926506, "learning_rate": 2.585336538461538e-07, "loss": -0.0032, "num_tokens": 125702211.0, "reward": 0.8093701799710592, "reward_std": 0.2061161051193873, "rewards/reward_fn/mean": 0.8093701799710592, "rewards/reward_fn/std": 0.20611611505349478, "sampling/importance_sampling_ratio/max": 1.8227382898330688, "sampling/importance_sampling_ratio/mean": 0.4572509129842122, "sampling/importance_sampling_ratio/min": 0.00017502542808263874, "sampling/sampling_logp_difference/max": 2.094012419382731, "sampling/sampling_logp_difference/mean": 0.0054199475174148875, "step": 6170, "step_time": 10.209660891722887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2886.0, "completions/mean_length": 996.640625, "completions/mean_terminated_length": 620.1222839355469, "completions/min_length": 134.5, "completions/min_terminated_length": 134.5, "entropy": 0.02470441535115242, "epoch": 0.7427884615384616, "frac_reward_zero_std": 0.0, "grad_norm": 0.03162581846117973, "learning_rate": 2.5733173076923077e-07, "loss": -0.01, "num_tokens": 125831612.0, "reward": 0.7643669247627258, "reward_std": 0.22276607155799866, "rewards/reward_fn/mean": 0.7643669247627258, "rewards/reward_fn/std": 0.22276606410741806, "sampling/importance_sampling_ratio/max": 1.4830988049507141, "sampling/importance_sampling_ratio/mean": 0.44028596580028534, "sampling/importance_sampling_ratio/min": 4.932718866257346e-06, "sampling/sampling_logp_difference/max": 3.2440385818481445, "sampling/sampling_logp_difference/mean": 0.0056603774428367615, "step": 6180, "step_time": 6.848589114658535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23958333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1595.3333333333333, "completions/mean_length": 1044.28125, "completions/mean_terminated_length": 436.1657206217448, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.026028620637953283, "epoch": 0.7439903846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.00336633762344718, "learning_rate": 2.561298076923077e-07, "loss": -0.0027, "num_tokens": 126051119.0, "reward": 0.7471586068471273, "reward_std": 0.24911563098430634, "rewards/reward_fn/mean": 0.7471586068471273, "rewards/reward_fn/std": 0.2491156260172526, "sampling/importance_sampling_ratio/max": 2.0698062976201377, "sampling/importance_sampling_ratio/mean": 0.41222695509592694, "sampling/importance_sampling_ratio/min": 0.0003240059447004266, "sampling/sampling_logp_difference/max": 1.9519989093144734, "sampling/sampling_logp_difference/mean": 0.005743108845005433, "step": 6190, "step_time": 10.137031169980764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1565.0, "completions/mean_length": 678.34375, "completions/mean_terminated_length": 341.9677429199219, "completions/min_length": 112.5, "completions/min_terminated_length": 112.5, "entropy": 0.02174437791109085, "epoch": 0.7451923076923077, "frac_reward_zero_std": 0.125, "grad_norm": 0.022531913593411446, "learning_rate": 2.549278846153846e-07, "loss": -0.0046, "num_tokens": 126152453.0, "reward": 0.6633342504501343, "reward_std": 0.36839060485363007, "rewards/reward_fn/mean": 0.6633342504501343, "rewards/reward_fn/std": 0.36839060485363007, "sampling/importance_sampling_ratio/max": 1.411191463470459, "sampling/importance_sampling_ratio/mean": 0.5695602297782898, "sampling/importance_sampling_ratio/min": 0.0004202878083106043, "sampling/sampling_logp_difference/max": 1.8477265238761902, "sampling/sampling_logp_difference/mean": 0.005133889149874449, "step": 6200, "step_time": 6.879880720656365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2454.0, "completions/mean_length": 934.8333333333334, "completions/mean_terminated_length": 542.8522542317709, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.02580072022974491, "epoch": 0.7463942307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.0043939552269876, "learning_rate": 2.5372596153846154e-07, "loss": -0.0052, "num_tokens": 126354981.0, "reward": 0.7894503871599833, "reward_std": 0.2263839840888977, "rewards/reward_fn/mean": 0.7894503871599833, "rewards/reward_fn/std": 0.22638397415479025, "sampling/importance_sampling_ratio/max": 1.3805118004480998, "sampling/importance_sampling_ratio/mean": 0.42099906007448834, "sampling/importance_sampling_ratio/min": 0.00019796784908976406, "sampling/sampling_logp_difference/max": 2.4643187522888184, "sampling/sampling_logp_difference/mean": 0.005623232418050368, "step": 6210, "step_time": 10.059448134992271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2073.5, "completions/max_terminated_length": 1149.5, "completions/mean_length": 408.75, "completions/mean_terminated_length": 324.5500030517578, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.026473480463027953, "epoch": 0.7475961538461539, "frac_reward_zero_std": 0.125, "grad_norm": 0.0390981025993824, "learning_rate": 2.525240384615385e-07, "loss": -0.0025, "num_tokens": 126439365.0, "reward": 0.856959879398346, "reward_std": 0.1020665867254138, "rewards/reward_fn/mean": 0.856959879398346, "rewards/reward_fn/std": 0.10206659277901053, "sampling/importance_sampling_ratio/max": 1.579596221446991, "sampling/importance_sampling_ratio/mean": 0.5148894786834717, "sampling/importance_sampling_ratio/min": 0.008198158226150554, "sampling/sampling_logp_difference/max": 2.9373703002929688, "sampling/sampling_logp_difference/mean": 0.006591839250177145, "step": 6220, "step_time": 4.60745719531551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20833333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1473.6666666666667, "completions/mean_length": 978.0, "completions/mean_terminated_length": 441.4722493489583, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.027406233176589013, "epoch": 0.7487980769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.013533253222703934, "learning_rate": 2.5132211538461537e-07, "loss": -0.0067, "num_tokens": 126636517.0, "reward": 0.7845401565233866, "reward_std": 0.23439828058083853, "rewards/reward_fn/mean": 0.7845401565233866, "rewards/reward_fn/std": 0.2343982756137848, "sampling/importance_sampling_ratio/max": 1.2569525241851807, "sampling/importance_sampling_ratio/mean": 0.40735456347465515, "sampling/importance_sampling_ratio/min": 0.0003484192340389806, "sampling/sampling_logp_difference/max": 2.878366231918335, "sampling/sampling_logp_difference/mean": 0.006071639402459065, "step": 6230, "step_time": 10.144457901082934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2267.5, "completions/mean_length": 941.703125, "completions/mean_terminated_length": 606.3966674804688, "completions/min_length": 117.5, "completions/min_terminated_length": 117.5, "entropy": 0.028082165494561197, "epoch": 0.75, "frac_reward_zero_std": 0.0, "grad_norm": 0.007947959937155247, "learning_rate": 2.501201923076923e-07, "loss": 0.0138, "num_tokens": 126766802.0, "reward": 0.8190434873104095, "reward_std": 0.20620882511138916, "rewards/reward_fn/mean": 0.8190434873104095, "rewards/reward_fn/std": 0.20620882511138916, "sampling/importance_sampling_ratio/max": 2.169743299484253, "sampling/importance_sampling_ratio/mean": 0.3949666693806648, "sampling/importance_sampling_ratio/min": 0.001612751426819159, "sampling/sampling_logp_difference/max": 3.364440679550171, "sampling/sampling_logp_difference/mean": 0.006723221158608794, "step": 6240, "step_time": 7.039070412982255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2059.0, "completions/mean_length": 1061.5833333333333, "completions/mean_terminated_length": 533.3770243326823, "completions/min_length": 139.33333333333334, "completions/min_terminated_length": 139.33333333333334, "entropy": 0.025717976316809655, "epoch": 0.7512019230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.009164382703602314, "learning_rate": 2.489182692307692e-07, "loss": 0.0012, "num_tokens": 126974458.0, "reward": 0.7754298249880472, "reward_std": 0.21128617723782858, "rewards/reward_fn/mean": 0.7754298249880472, "rewards/reward_fn/std": 0.21128617723782858, "sampling/importance_sampling_ratio/max": 1.5697091420491536, "sampling/importance_sampling_ratio/mean": 0.3703569670518239, "sampling/importance_sampling_ratio/min": 0.00012929074364365079, "sampling/sampling_logp_difference/max": 1.7348146835962932, "sampling/sampling_logp_difference/mean": 0.005706051054100196, "step": 6250, "step_time": 10.064464110974223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 1536.5, "completions/mean_length": 1211.328125, "completions/mean_terminated_length": 609.2495727539062, "completions/min_length": 187.5, "completions/min_terminated_length": 187.5, "entropy": 0.02364319358021021, "epoch": 0.7524038461538461, "frac_reward_zero_std": 0.0, "grad_norm": 0.0014131038915365934, "learning_rate": 2.4771634615384615e-07, "loss": 0.0061, "num_tokens": 127115727.0, "reward": 0.7494404315948486, "reward_std": 0.20544517040252686, "rewards/reward_fn/mean": 0.7494404315948486, "rewards/reward_fn/std": 0.20544515550136566, "sampling/importance_sampling_ratio/max": 1.6659674048423767, "sampling/importance_sampling_ratio/mean": 0.3815891444683075, "sampling/importance_sampling_ratio/min": 8.150338544510305e-05, "sampling/sampling_logp_difference/max": 2.3832713961601257, "sampling/sampling_logp_difference/mean": 0.005334565881639719, "step": 6260, "step_time": 6.979578013159335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.22916666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 1672.3333333333333, "completions/mean_length": 1060.96875, "completions/mean_terminated_length": 472.34336344401044, "completions/min_length": 131.33333333333334, "completions/min_terminated_length": 131.33333333333334, "entropy": 0.024102209508419035, "epoch": 0.7536057692307693, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.013307558372616768, "learning_rate": 2.465144230769231e-07, "loss": -0.0012, "num_tokens": 127328172.0, "reward": 0.6920252641042074, "reward_std": 0.296814168492953, "rewards/reward_fn/mean": 0.6920252641042074, "rewards/reward_fn/std": 0.2968141585588455, "sampling/importance_sampling_ratio/max": 1.154616395632426, "sampling/importance_sampling_ratio/mean": 0.36917200684547424, "sampling/importance_sampling_ratio/min": 0.0002021818739497879, "sampling/sampling_logp_difference/max": 3.1364585161209106, "sampling/sampling_logp_difference/mean": 0.004974593408405781, "step": 6270, "step_time": 10.240134698990733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1759.0, "completions/mean_length": 1336.921875, "completions/mean_terminated_length": 478.61058044433594, "completions/min_length": 110.5, "completions/min_terminated_length": 110.5, "entropy": 0.02099884208291769, "epoch": 0.7548076923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.003246026113629341, "learning_rate": 2.453125e-07, "loss": -0.0066, "num_tokens": 127480711.0, "reward": 0.703516274690628, "reward_std": 0.25771962106227875, "rewards/reward_fn/mean": 0.703516274690628, "rewards/reward_fn/std": 0.25771960616111755, "sampling/importance_sampling_ratio/max": 1.1503158807754517, "sampling/importance_sampling_ratio/mean": 0.2900296673178673, "sampling/importance_sampling_ratio/min": 0.0001918977068271488, "sampling/sampling_logp_difference/max": 1.7428103685379028, "sampling/sampling_logp_difference/mean": 0.005596538307145238, "step": 6280, "step_time": 6.853450087457896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1782.6666666666667, "completions/mean_length": 721.8645833333334, "completions/mean_terminated_length": 401.40256754557294, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.022710952162742614, "epoch": 0.7560096153846154, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.016201041638851166, "learning_rate": 2.441105769230769e-07, "loss": -0.0032, "num_tokens": 127656618.0, "reward": 0.743995209534963, "reward_std": 0.2605361392100652, "rewards/reward_fn/mean": 0.743995209534963, "rewards/reward_fn/std": 0.26053612927595776, "sampling/importance_sampling_ratio/max": 1.3755874236424763, "sampling/importance_sampling_ratio/mean": 0.5215668578942617, "sampling/importance_sampling_ratio/min": 0.0001724580503529675, "sampling/sampling_logp_difference/max": 3.036620299021403, "sampling/sampling_logp_difference/mean": 0.005381170815477769, "step": 6290, "step_time": 9.773634252138436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1512.0, "completions/mean_length": 789.40625, "completions/mean_terminated_length": 428.89418029785156, "completions/min_length": 86.5, "completions/min_terminated_length": 86.5, "entropy": 0.023136665113270283, "epoch": 0.7572115384615384, "frac_reward_zero_std": 0.0, "grad_norm": 0.009056886658072472, "learning_rate": 2.429086538461538e-07, "loss": -0.0008, "num_tokens": 127779588.0, "reward": 0.7051724791526794, "reward_std": 0.31369417160749435, "rewards/reward_fn/mean": 0.7051724791526794, "rewards/reward_fn/std": 0.31369417160749435, "sampling/importance_sampling_ratio/max": 1.1271790266036987, "sampling/importance_sampling_ratio/mean": 0.4346010833978653, "sampling/importance_sampling_ratio/min": 0.00039160705637186766, "sampling/sampling_logp_difference/max": 1.8695833683013916, "sampling/sampling_logp_difference/mean": 0.004978695884346962, "step": 6300, "step_time": 6.970181372854858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.17708333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2141.6666666666665, "completions/mean_length": 896.3541666666666, "completions/mean_terminated_length": 451.0016682942708, "completions/min_length": 116.33333333333333, "completions/min_terminated_length": 116.33333333333333, "entropy": 0.02496674545109272, "epoch": 0.7584134615384616, "frac_reward_zero_std": 0.0, "grad_norm": 0.013409922830760479, "learning_rate": 2.4170673076923076e-07, "loss": -0.0126, "num_tokens": 127974494.0, "reward": 0.7162526448567709, "reward_std": 0.25514793892701465, "rewards/reward_fn/mean": 0.7162526448567709, "rewards/reward_fn/std": 0.2551479289929072, "sampling/importance_sampling_ratio/max": 1.5796699523925781, "sampling/importance_sampling_ratio/mean": 0.4646098514397939, "sampling/importance_sampling_ratio/min": 0.0005845369790525486, "sampling/sampling_logp_difference/max": 1.6061672767003377, "sampling/sampling_logp_difference/mean": 0.005609122881044944, "step": 6310, "step_time": 9.897959602158517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1919.5, "completions/mean_length": 962.859375, "completions/mean_terminated_length": 539.2784881591797, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.027844147011637687, "epoch": 0.7596153846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.011548365466296673, "learning_rate": 2.405048076923077e-07, "loss": 0.0029, "num_tokens": 128109133.0, "reward": 0.7509656548500061, "reward_std": 0.23802949488162994, "rewards/reward_fn/mean": 0.7509656548500061, "rewards/reward_fn/std": 0.23802948743104935, "sampling/importance_sampling_ratio/max": 1.9913705587387085, "sampling/importance_sampling_ratio/mean": 0.4134083241224289, "sampling/importance_sampling_ratio/min": 1.5096168567652057e-05, "sampling/sampling_logp_difference/max": 1.9163222908973694, "sampling/sampling_logp_difference/mean": 0.00621458888053894, "step": 6320, "step_time": 7.070012309867889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 2253.6666666666665, "completions/mean_length": 1165.84375, "completions/mean_terminated_length": 549.6691284179688, "completions/min_length": 146.33333333333334, "completions/min_terminated_length": 146.33333333333334, "entropy": 0.027211136557161807, "epoch": 0.7608173076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.006184997968375683, "learning_rate": 2.393028846153846e-07, "loss": 0.0018, "num_tokens": 128335350.0, "reward": 0.7771288553873698, "reward_std": 0.23461776475111643, "rewards/reward_fn/mean": 0.7771288553873698, "rewards/reward_fn/std": 0.23461775481700897, "sampling/importance_sampling_ratio/max": 1.5574042797088623, "sampling/importance_sampling_ratio/mean": 0.3215153714021047, "sampling/importance_sampling_ratio/min": 0.0001681664355146495, "sampling/sampling_logp_difference/max": 2.2350169022878013, "sampling/sampling_logp_difference/mean": 0.005843599171688159, "step": 6330, "step_time": 10.377913074288518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1917.5, "completions/mean_length": 1052.96875, "completions/mean_terminated_length": 456.00169372558594, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.023498499765992165, "epoch": 0.7620192307692307, "frac_reward_zero_std": 0.25, "grad_norm": 0.005394944455474615, "learning_rate": 2.381009615384615e-07, "loss": 0.0001, "num_tokens": 128473588.0, "reward": 0.7098814249038696, "reward_std": 0.24179552495479584, "rewards/reward_fn/mean": 0.7098814249038696, "rewards/reward_fn/std": 0.24179552495479584, "sampling/importance_sampling_ratio/max": 1.1283805668354034, "sampling/importance_sampling_ratio/mean": 0.3337247669696808, "sampling/importance_sampling_ratio/min": 4.700501904153498e-06, "sampling/sampling_logp_difference/max": 4.2100220918655396, "sampling/sampling_logp_difference/mean": 0.006380802718922496, "step": 6340, "step_time": 7.1629114927724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.17708333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2459.0, "completions/mean_length": 1028.3020833333333, "completions/mean_terminated_length": 609.6059773763021, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.025661179423332216, "epoch": 0.7632211538461539, "frac_reward_zero_std": 0.0, "grad_norm": 0.01955968514084816, "learning_rate": 2.3689903846153845e-07, "loss": -0.0046, "num_tokens": 128682753.0, "reward": 0.7335747480392456, "reward_std": 0.2751753479242325, "rewards/reward_fn/mean": 0.7335747480392456, "rewards/reward_fn/std": 0.27517535785833996, "sampling/importance_sampling_ratio/max": 1.9170605341593425, "sampling/importance_sampling_ratio/mean": 0.4148405094941457, "sampling/importance_sampling_ratio/min": 0.00010249889661887816, "sampling/sampling_logp_difference/max": 1.4208136002222698, "sampling/sampling_logp_difference/mean": 0.005767288462569316, "step": 6350, "step_time": 10.241502574738115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2717.0, "completions/mean_length": 1348.640625, "completions/mean_terminated_length": 693.3600006103516, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.02349309790879488, "epoch": 0.7644230769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.006977013312280178, "learning_rate": 2.3569711538461536e-07, "loss": -0.0033, "num_tokens": 128843234.0, "reward": 0.7319533228874207, "reward_std": 0.2513008788228035, "rewards/reward_fn/mean": 0.7319533228874207, "rewards/reward_fn/std": 0.2513008862733841, "sampling/importance_sampling_ratio/max": 1.1911361515522003, "sampling/importance_sampling_ratio/mean": 0.3273128867149353, "sampling/importance_sampling_ratio/min": 1.7135375856014434e-05, "sampling/sampling_logp_difference/max": 2.441097855567932, "sampling/sampling_logp_difference/mean": 0.005323802120983601, "step": 6360, "step_time": 7.116110656782984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1924.3333333333333, "completions/mean_length": 1229.65625, "completions/mean_terminated_length": 510.73460896809894, "completions/min_length": 131.33333333333334, "completions/min_terminated_length": 131.33333333333334, "entropy": 0.020390500500798225, "epoch": 0.765625, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.0015104797203093767, "learning_rate": 2.344951923076923e-07, "loss": -0.0038, "num_tokens": 129071809.0, "reward": 0.7485354542732239, "reward_std": 0.23066451152165732, "rewards/reward_fn/mean": 0.7485354542732239, "rewards/reward_fn/std": 0.23066449662049612, "sampling/importance_sampling_ratio/max": 1.1118379831314087, "sampling/importance_sampling_ratio/mean": 0.3413255016009013, "sampling/importance_sampling_ratio/min": 5.944032193383464e-05, "sampling/sampling_logp_difference/max": 4.731772502263387, "sampling/sampling_logp_difference/mean": 0.004800131854911645, "step": 6370, "step_time": 9.900555419176817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1542.5, "completions/mean_length": 1669.03125, "completions/mean_terminated_length": 593.8584289550781, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.024426512233912945, "epoch": 0.7668269230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.006891779601573944, "learning_rate": 2.3329326923076922e-07, "loss": -0.0033, "num_tokens": 129258139.0, "reward": 0.6855749487876892, "reward_std": 0.24165312200784683, "rewards/reward_fn/mean": 0.6855749487876892, "rewards/reward_fn/std": 0.24165312200784683, "sampling/importance_sampling_ratio/max": 1.1821443140506744, "sampling/importance_sampling_ratio/mean": 0.20584849268198013, "sampling/importance_sampling_ratio/min": 3.046533493034076e-05, "sampling/sampling_logp_difference/max": 1.9580764770507812, "sampling/sampling_logp_difference/mean": 0.005581441335380077, "step": 6380, "step_time": 7.151316049043089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2916666666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2326.6666666666665, "completions/mean_length": 1309.75, "completions/mean_terminated_length": 615.4285888671875, "completions/min_length": 147.33333333333334, "completions/min_terminated_length": 147.33333333333334, "entropy": 0.022929828613996506, "epoch": 0.7680288461538461, "frac_reward_zero_std": 0.0, "grad_norm": 0.012160804122686386, "learning_rate": 2.3209134615384614e-07, "loss": -0.0018, "num_tokens": 129479755.0, "reward": 0.7184622486432394, "reward_std": 0.2340739369392395, "rewards/reward_fn/mean": 0.7184622486432394, "rewards/reward_fn/std": 0.23407392700513205, "sampling/importance_sampling_ratio/max": 1.9326821168263753, "sampling/importance_sampling_ratio/mean": 0.3890141248703003, "sampling/importance_sampling_ratio/min": 0.0004392962327983696, "sampling/sampling_logp_difference/max": 1.6277374029159546, "sampling/sampling_logp_difference/mean": 0.004767620625595252, "step": 6390, "step_time": 9.995945604145527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1637.0, "completions/mean_length": 1262.1875, "completions/mean_terminated_length": 427.5911102294922, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.022999062947928906, "epoch": 0.7692307692307693, "frac_reward_zero_std": 0.125, "grad_norm": 0.011029810644686222, "learning_rate": 2.3088942307692308e-07, "loss": 0.0135, "num_tokens": 129632799.0, "reward": 0.7342900633811951, "reward_std": 0.2416999489068985, "rewards/reward_fn/mean": 0.7342900633811951, "rewards/reward_fn/std": 0.2416999414563179, "sampling/importance_sampling_ratio/max": 1.379205346107483, "sampling/importance_sampling_ratio/mean": 0.3977776914834976, "sampling/importance_sampling_ratio/min": 1.943104507518001e-05, "sampling/sampling_logp_difference/max": 3.811688184738159, "sampling/sampling_logp_difference/mean": 0.005294500384479761, "step": 6400, "step_time": 7.154633239284157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19791666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 1735.3333333333333, "completions/mean_length": 974.5520833333334, "completions/mean_terminated_length": 475.5280456542969, "completions/min_length": 115.33333333333333, "completions/min_terminated_length": 115.33333333333333, "entropy": 0.020970470644533633, "epoch": 0.7704326923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.015020061284303665, "learning_rate": 2.2968749999999997e-07, "loss": 0.0005, "num_tokens": 129826204.0, "reward": 0.750956654548645, "reward_std": 0.24425905446211496, "rewards/reward_fn/mean": 0.750956654548645, "rewards/reward_fn/std": 0.24425905446211496, "sampling/importance_sampling_ratio/max": 1.597430149714152, "sampling/importance_sampling_ratio/mean": 0.5154827137788137, "sampling/importance_sampling_ratio/min": 0.0004753152558502431, "sampling/sampling_logp_difference/max": 5.273748874664307, "sampling/sampling_logp_difference/mean": 0.005259290182342132, "step": 6410, "step_time": 10.188216369692237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1789.0, "completions/mean_length": 1029.0625, "completions/mean_terminated_length": 549.7399291992188, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.021537656150758266, "epoch": 0.7716346153846154, "frac_reward_zero_std": 0.125, "grad_norm": 0.004855678416788578, "learning_rate": 2.2848557692307691e-07, "loss": 0.0012, "num_tokens": 129970472.0, "reward": 0.7924682796001434, "reward_std": 0.2155575528740883, "rewards/reward_fn/mean": 0.7924682796001434, "rewards/reward_fn/std": 0.2155575454235077, "sampling/importance_sampling_ratio/max": 1.0496763586997986, "sampling/importance_sampling_ratio/mean": 0.4073209762573242, "sampling/importance_sampling_ratio/min": 0.00011396428635634948, "sampling/sampling_logp_difference/max": 2.6411770582199097, "sampling/sampling_logp_difference/mean": 0.0047862837091088295, "step": 6420, "step_time": 7.021123604290187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2708333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1566.0, "completions/mean_length": 1122.25, "completions/mean_terminated_length": 453.30385335286456, "completions/min_length": 92.66666666666667, "completions/min_terminated_length": 92.66666666666667, "entropy": 0.019433072954416274, "epoch": 0.7728365384615384, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.011258325539529324, "learning_rate": 2.2728365384615383e-07, "loss": -0.0054, "num_tokens": 130171208.0, "reward": 0.6155736446380615, "reward_std": 0.2751859575510025, "rewards/reward_fn/mean": 0.6155736446380615, "rewards/reward_fn/std": 0.2751859277486801, "sampling/importance_sampling_ratio/max": 1.4799032807350159, "sampling/importance_sampling_ratio/mean": 0.37806036074956256, "sampling/importance_sampling_ratio/min": 5.222002963970833e-06, "sampling/sampling_logp_difference/max": 1.8974594672520955, "sampling/sampling_logp_difference/mean": 0.00485536417302986, "step": 6430, "step_time": 10.243899238761514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1826.0, "completions/mean_length": 938.234375, "completions/mean_terminated_length": 560.1992034912109, "completions/min_length": 134.5, "completions/min_terminated_length": 134.5, "entropy": 0.02236058432608843, "epoch": 0.7740384615384616, "frac_reward_zero_std": 0.0, "grad_norm": 0.2664995491504669, "learning_rate": 2.2608173076923077e-07, "loss": -0.0125, "num_tokens": 130296159.0, "reward": 0.7163574397563934, "reward_std": 0.28444670885801315, "rewards/reward_fn/mean": 0.7163574397563934, "rewards/reward_fn/std": 0.28444670885801315, "sampling/importance_sampling_ratio/max": 2.0757537484169006, "sampling/importance_sampling_ratio/mean": 0.3977290987968445, "sampling/importance_sampling_ratio/min": 9.728725399327232e-05, "sampling/sampling_logp_difference/max": 2.102388858795166, "sampling/sampling_logp_difference/mean": 0.0054919603280723095, "step": 6440, "step_time": 6.845690112467855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23958333333333334, "completions/max_length": 2651.0, "completions/max_terminated_length": 1809.3333333333333, "completions/mean_length": 1139.9583333333333, "completions/mean_terminated_length": 557.8441670735677, "completions/min_length": 192.33333333333334, "completions/min_terminated_length": 192.33333333333334, "entropy": 0.0220164118334651, "epoch": 0.7752403846153846, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.003299495903775096, "learning_rate": 2.248798076923077e-07, "loss": -0.0058, "num_tokens": 130519099.0, "reward": 0.655868927637736, "reward_std": 0.2828322499990463, "rewards/reward_fn/mean": 0.655868927637736, "rewards/reward_fn/std": 0.28283224006493884, "sampling/importance_sampling_ratio/max": 1.348021109898885, "sampling/importance_sampling_ratio/mean": 0.3494677444299062, "sampling/importance_sampling_ratio/min": 0.00044332155474080537, "sampling/sampling_logp_difference/max": 2.7338005701700845, "sampling/sampling_logp_difference/mean": 0.005220706108957529, "step": 6450, "step_time": 8.825800002273173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1887.0, "completions/mean_length": 783.734375, "completions/mean_terminated_length": 442.5961151123047, "completions/min_length": 94.5, "completions/min_terminated_length": 94.5, "entropy": 0.024755119532346725, "epoch": 0.7764423076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.02358873374760151, "learning_rate": 2.236778846153846e-07, "loss": 0.0112, "num_tokens": 130643026.0, "reward": 0.7533807754516602, "reward_std": 0.2985115572810173, "rewards/reward_fn/mean": 0.7533807754516602, "rewards/reward_fn/std": 0.2985115721821785, "sampling/importance_sampling_ratio/max": 1.4791312217712402, "sampling/importance_sampling_ratio/mean": 0.4596271216869354, "sampling/importance_sampling_ratio/min": 3.450454369158251e-05, "sampling/sampling_logp_difference/max": 3.699270725250244, "sampling/sampling_logp_difference/mean": 0.005946396850049496, "step": 6460, "step_time": 6.745776568166912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3229166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2600.0, "completions/mean_length": 1572.5104166666667, "completions/mean_terminated_length": 910.4766438802084, "completions/min_length": 175.33333333333334, "completions/min_terminated_length": 175.33333333333334, "entropy": 0.02451849803328514, "epoch": 0.7776442307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.007385291624814272, "learning_rate": 2.2247596153846152e-07, "loss": -0.0028, "num_tokens": 130903043.0, "reward": 0.6856079697608948, "reward_std": 0.27949583530426025, "rewards/reward_fn/mean": 0.6856079697608948, "rewards/reward_fn/std": 0.27949584027131397, "sampling/importance_sampling_ratio/max": 1.8342960278193157, "sampling/importance_sampling_ratio/mean": 0.2026625523964564, "sampling/importance_sampling_ratio/min": 6.784878776973831e-05, "sampling/sampling_logp_difference/max": 3.157327930132548, "sampling/sampling_logp_difference/mean": 0.006009201053529978, "step": 6470, "step_time": 10.397424840833992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1823.0, "completions/mean_length": 726.484375, "completions/mean_terminated_length": 401.6964416503906, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.02180813644081354, "epoch": 0.7788461538461539, "frac_reward_zero_std": 0.0, "grad_norm": 0.008564473129808903, "learning_rate": 2.2127403846153844e-07, "loss": -0.0086, "num_tokens": 131014066.0, "reward": 0.8276880979537964, "reward_std": 0.17938269674777985, "rewards/reward_fn/mean": 0.8276880979537964, "rewards/reward_fn/std": 0.17938270419836044, "sampling/importance_sampling_ratio/max": 1.1893795728683472, "sampling/importance_sampling_ratio/mean": 0.55002860724926, "sampling/importance_sampling_ratio/min": 0.00016342973185601295, "sampling/sampling_logp_difference/max": 2.015198826789856, "sampling/sampling_logp_difference/mean": 0.004767443053424358, "step": 6480, "step_time": 6.995264174230397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2237.6666666666665, "completions/mean_length": 813.8125, "completions/mean_terminated_length": 613.3798116048177, "completions/min_length": 147.66666666666666, "completions/min_terminated_length": 147.66666666666666, "entropy": 0.026985703967511654, "epoch": 0.7800480769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.017563199624419212, "learning_rate": 2.2007211538461538e-07, "loss": -0.0007, "num_tokens": 131195312.0, "reward": 0.7821859916051229, "reward_std": 0.2335844188928604, "rewards/reward_fn/mean": 0.7821859916051229, "rewards/reward_fn/std": 0.2335844337940216, "sampling/importance_sampling_ratio/max": 1.8969378471374512, "sampling/importance_sampling_ratio/mean": 0.3717627425988515, "sampling/importance_sampling_ratio/min": 3.500330725122088e-06, "sampling/sampling_logp_difference/max": 2.044815182685852, "sampling/sampling_logp_difference/mean": 0.0063963850649694605, "step": 6490, "step_time": 9.931190715916454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1870.0, "completions/mean_length": 1208.34375, "completions/mean_terminated_length": 673.6978454589844, "completions/min_length": 219.5, "completions/min_terminated_length": 219.5, "entropy": 0.024841964803636075, "epoch": 0.78125, "frac_reward_zero_std": 0.0, "grad_norm": 0.00917114783078432, "learning_rate": 2.188701923076923e-07, "loss": -0.0013, "num_tokens": 131336318.0, "reward": 0.7161056995391846, "reward_std": 0.27645187079906464, "rewards/reward_fn/mean": 0.7161056995391846, "rewards/reward_fn/std": 0.27645187079906464, "sampling/importance_sampling_ratio/max": 0.7408380508422852, "sampling/importance_sampling_ratio/mean": 0.23636504262685776, "sampling/importance_sampling_ratio/min": 5.306280581862666e-05, "sampling/sampling_logp_difference/max": 1.5700448155403137, "sampling/sampling_logp_difference/mean": 0.005469013471156359, "step": 6500, "step_time": 7.023140334151686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3020833333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1986.3333333333333, "completions/mean_length": 1229.4479166666667, "completions/mean_terminated_length": 472.0290934244792, "completions/min_length": 94.33333333333333, "completions/min_terminated_length": 94.33333333333333, "entropy": 0.024025858752429484, "epoch": 0.7824519230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.013341572135686874, "learning_rate": 2.1766826923076921e-07, "loss": -0.002, "num_tokens": 131562329.0, "reward": 0.729966918627421, "reward_std": 0.24983691175778708, "rewards/reward_fn/mean": 0.729966918627421, "rewards/reward_fn/std": 0.24983691175778708, "sampling/importance_sampling_ratio/max": 1.3872474829355876, "sampling/importance_sampling_ratio/mean": 0.34647701183954877, "sampling/importance_sampling_ratio/min": 8.93455827129704e-06, "sampling/sampling_logp_difference/max": 2.5955702861150107, "sampling/sampling_logp_difference/mean": 0.005297545498857896, "step": 6510, "step_time": 10.260877505224197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1494.5, "completions/mean_length": 813.828125, "completions/mean_terminated_length": 457.7970886230469, "completions/min_length": 147.5, "completions/min_terminated_length": 147.5, "entropy": 0.022609376907348634, "epoch": 0.7836538461538461, "frac_reward_zero_std": 0.125, "grad_norm": 0.014686319045722485, "learning_rate": 2.1646634615384616e-07, "loss": 0.0044, "num_tokens": 131677174.0, "reward": 0.7107685804367065, "reward_std": 0.3167833238840103, "rewards/reward_fn/mean": 0.7107685804367065, "rewards/reward_fn/std": 0.3167833387851715, "sampling/importance_sampling_ratio/max": 1.3483315706253052, "sampling/importance_sampling_ratio/mean": 0.47883328795433044, "sampling/importance_sampling_ratio/min": 0.0001250476498171338, "sampling/sampling_logp_difference/max": 6.145913600921631, "sampling/sampling_logp_difference/mean": 0.004720924887806177, "step": 6520, "step_time": 6.6157913046889005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19791666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2253.6666666666665, "completions/mean_length": 1005.4583333333334, "completions/mean_terminated_length": 512.543701171875, "completions/min_length": 133.33333333333334, "completions/min_terminated_length": 133.33333333333334, "entropy": 0.02337682619690895, "epoch": 0.7848557692307693, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.005754920653998852, "learning_rate": 2.1526442307692307e-07, "loss": -0.0011, "num_tokens": 131891882.0, "reward": 0.7122276226679484, "reward_std": 0.29582105080286664, "rewards/reward_fn/mean": 0.7122276226679484, "rewards/reward_fn/std": 0.29582106073697406, "sampling/importance_sampling_ratio/max": 1.3354840278625488, "sampling/importance_sampling_ratio/mean": 0.44652629892031354, "sampling/importance_sampling_ratio/min": 9.211382651604557e-06, "sampling/sampling_logp_difference/max": 1.7231919368108113, "sampling/sampling_logp_difference/mean": 0.004943746142089367, "step": 6530, "step_time": 10.258027809578925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1996.5, "completions/mean_length": 1002.234375, "completions/mean_terminated_length": 538.4273986816406, "completions/min_length": 193.5, "completions/min_terminated_length": 193.5, "entropy": 0.02484949743375182, "epoch": 0.7860576923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.0031422271858900785, "learning_rate": 2.140625e-07, "loss": 0.0085, "num_tokens": 132019417.0, "reward": 0.8052859902381897, "reward_std": 0.22535612434148788, "rewards/reward_fn/mean": 0.8052859902381897, "rewards/reward_fn/std": 0.22535613179206848, "sampling/importance_sampling_ratio/max": 1.3672587871551514, "sampling/importance_sampling_ratio/mean": 0.35773882269859314, "sampling/importance_sampling_ratio/min": 0.0003677115379900897, "sampling/sampling_logp_difference/max": 3.9303612112998962, "sampling/sampling_logp_difference/mean": 0.005882231052964926, "step": 6540, "step_time": 7.0045367635786535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2380.6666666666665, "completions/mean_length": 1359.53125, "completions/mean_terminated_length": 617.2916666666666, "completions/min_length": 155.66666666666666, "completions/min_terminated_length": 155.66666666666666, "entropy": 0.024561302736401558, "epoch": 0.7872596153846154, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.03421666845679283, "learning_rate": 2.128605769230769e-07, "loss": -0.0016, "num_tokens": 132276748.0, "reward": 0.7342647910118103, "reward_std": 0.2654726604620616, "rewards/reward_fn/mean": 0.7342647910118103, "rewards/reward_fn/std": 0.265472670396169, "sampling/importance_sampling_ratio/max": 1.7912176847457886, "sampling/importance_sampling_ratio/mean": 0.365395466486613, "sampling/importance_sampling_ratio/min": 1.4955793773197001e-05, "sampling/sampling_logp_difference/max": 2.5514602263768515, "sampling/sampling_logp_difference/mean": 0.005414800718426704, "step": 6550, "step_time": 10.547477302979678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2655.5, "completions/mean_length": 1316.609375, "completions/mean_terminated_length": 551.4318237304688, "completions/min_length": 125.5, "completions/min_terminated_length": 125.5, "entropy": 0.0269907645881176, "epoch": 0.7884615384615384, "frac_reward_zero_std": 0.0, "grad_norm": 0.0036833127960562706, "learning_rate": 2.1165865384615382e-07, "loss": 0.0011, "num_tokens": 132493987.0, "reward": 0.6874318420886993, "reward_std": 0.25198687613010406, "rewards/reward_fn/mean": 0.6874318420886993, "rewards/reward_fn/std": 0.25198689103126526, "sampling/importance_sampling_ratio/max": 1.7238636016845703, "sampling/importance_sampling_ratio/mean": 0.3569500148296356, "sampling/importance_sampling_ratio/min": 2.745462845155089e-05, "sampling/sampling_logp_difference/max": 2.413658857345581, "sampling/sampling_logp_difference/mean": 0.005816133227199316, "step": 6560, "step_time": 7.94642373463139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19791666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2299.0, "completions/mean_length": 1131.0625, "completions/mean_terminated_length": 693.6716003417969, "completions/min_length": 171.66666666666666, "completions/min_terminated_length": 171.66666666666666, "entropy": 0.027014880441129208, "epoch": 0.7896634615384616, "frac_reward_zero_std": 0.0, "grad_norm": 0.011259070597589016, "learning_rate": 2.1045673076923076e-07, "loss": -0.0035, "num_tokens": 132705841.0, "reward": 0.7805444995562235, "reward_std": 0.21443259219328561, "rewards/reward_fn/mean": 0.7805444995562235, "rewards/reward_fn/std": 0.2144325872262319, "sampling/importance_sampling_ratio/max": 1.3073008855183919, "sampling/importance_sampling_ratio/mean": 0.35898177325725555, "sampling/importance_sampling_ratio/min": 1.146548591653603e-05, "sampling/sampling_logp_difference/max": 3.1891748905181885, "sampling/sampling_logp_difference/mean": 0.005825839626292388, "step": 6570, "step_time": 10.036884661763906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1592.5, "completions/mean_length": 1434.84375, "completions/mean_terminated_length": 449.1666717529297, "completions/min_length": 150.5, "completions/min_terminated_length": 150.5, "entropy": 0.023835280165076255, "epoch": 0.7908653846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.009387659840285778, "learning_rate": 2.0925480769230768e-07, "loss": -0.0104, "num_tokens": 132869351.0, "reward": 0.559809684753418, "reward_std": 0.3367984741926193, "rewards/reward_fn/mean": 0.559809684753418, "rewards/reward_fn/std": 0.33679845929145813, "sampling/importance_sampling_ratio/max": 1.358918696641922, "sampling/importance_sampling_ratio/mean": 0.33925241976976395, "sampling/importance_sampling_ratio/min": 0.0001255723057056457, "sampling/sampling_logp_difference/max": 3.5866798162460327, "sampling/sampling_logp_difference/mean": 0.005378775997087359, "step": 6580, "step_time": 7.097577294893563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.14583333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2286.3333333333335, "completions/mean_length": 891.8958333333334, "completions/mean_terminated_length": 534.6236470540365, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.026517597399652003, "epoch": 0.7920673076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.012581334449350834, "learning_rate": 2.0805288461538462e-07, "loss": -0.0075, "num_tokens": 133057861.0, "reward": 0.7603442867596945, "reward_std": 0.2702958931525548, "rewards/reward_fn/mean": 0.7603442867596945, "rewards/reward_fn/std": 0.2702959030866623, "sampling/importance_sampling_ratio/max": 1.8416343927383423, "sampling/importance_sampling_ratio/mean": 0.43478189905484516, "sampling/importance_sampling_ratio/min": 0.00010590544142511742, "sampling/sampling_logp_difference/max": 2.6947312355041504, "sampling/sampling_logp_difference/mean": 0.005884654199083646, "step": 6590, "step_time": 9.956350722815841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2181.5, "completions/mean_length": 996.234375, "completions/mean_terminated_length": 581.0427551269531, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.023529221676290034, "epoch": 0.7932692307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.00227147713303566, "learning_rate": 2.068509615384615e-07, "loss": -0.0055, "num_tokens": 133193916.0, "reward": 0.7924284934997559, "reward_std": 0.21060914546251297, "rewards/reward_fn/mean": 0.7924284934997559, "rewards/reward_fn/std": 0.21060914546251297, "sampling/importance_sampling_ratio/max": 1.1130031943321228, "sampling/importance_sampling_ratio/mean": 0.31793801486492157, "sampling/importance_sampling_ratio/min": 0.00010798356152008637, "sampling/sampling_logp_difference/max": 1.8339700102806091, "sampling/sampling_logp_difference/mean": 0.005292035406455398, "step": 6600, "step_time": 6.9679945667274295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2543.6666666666665, "completions/mean_length": 914.4583333333334, "completions/mean_terminated_length": 536.0920817057291, "completions/min_length": 125.33333333333333, "completions/min_terminated_length": 125.33333333333333, "entropy": 0.02373937163501978, "epoch": 0.7944711538461539, "frac_reward_zero_std": 0.0, "grad_norm": 0.020839033648371696, "learning_rate": 2.0564903846153846e-07, "loss": -0.001, "num_tokens": 133379320.0, "reward": 0.7674170732498169, "reward_std": 0.2539145201444626, "rewards/reward_fn/mean": 0.7674170732498169, "rewards/reward_fn/std": 0.2539145201444626, "sampling/importance_sampling_ratio/max": 1.5049407084782918, "sampling/importance_sampling_ratio/mean": 0.46292150020599365, "sampling/importance_sampling_ratio/min": 0.00044102051955026883, "sampling/sampling_logp_difference/max": 1.8866315285364788, "sampling/sampling_logp_difference/mean": 0.005424941889941692, "step": 6610, "step_time": 10.28979463884607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1694.5, "completions/mean_length": 759.40625, "completions/mean_terminated_length": 480.9248962402344, "completions/min_length": 123.5, "completions/min_terminated_length": 123.5, "entropy": 0.02462581731379032, "epoch": 0.7956730769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.01829206570982933, "learning_rate": 2.0444711538461537e-07, "loss": -0.0068, "num_tokens": 133508882.0, "reward": 0.8067486882209778, "reward_std": 0.24398691207170486, "rewards/reward_fn/mean": 0.8067486882209778, "rewards/reward_fn/std": 0.24398691952228546, "sampling/importance_sampling_ratio/max": 1.8114193677902222, "sampling/importance_sampling_ratio/mean": 0.4547928124666214, "sampling/importance_sampling_ratio/min": 4.4205255107954144e-05, "sampling/sampling_logp_difference/max": 1.9690753817558289, "sampling/sampling_logp_difference/mean": 0.006303089438006282, "step": 6620, "step_time": 6.865868071187288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2604166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2289.0, "completions/mean_length": 1257.4270833333333, "completions/mean_terminated_length": 679.584706624349, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.025565560348331927, "epoch": 0.796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.010480105876922607, "learning_rate": 2.032451923076923e-07, "loss": -0.0084, "num_tokens": 133737243.0, "reward": 0.7450114488601685, "reward_std": 0.22603963812192282, "rewards/reward_fn/mean": 0.7450114488601685, "rewards/reward_fn/std": 0.22603963812192282, "sampling/importance_sampling_ratio/max": 1.3415966828664143, "sampling/importance_sampling_ratio/mean": 0.3343837410211563, "sampling/importance_sampling_ratio/min": 3.4739812235784484e-05, "sampling/sampling_logp_difference/max": 2.1073215007781982, "sampling/sampling_logp_difference/mean": 0.005699796291689078, "step": 6630, "step_time": 10.156770315859466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2340.5, "completions/mean_length": 1347.625, "completions/mean_terminated_length": 614.5791778564453, "completions/min_length": 171.5, "completions/min_terminated_length": 171.5, "entropy": 0.024030371196568014, "epoch": 0.7980769230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.004449316766113043, "learning_rate": 2.0204326923076923e-07, "loss": -0.0057, "num_tokens": 133904075.0, "reward": 0.7497895956039429, "reward_std": 0.21974999457597733, "rewards/reward_fn/mean": 0.7497895956039429, "rewards/reward_fn/std": 0.21974999457597733, "sampling/importance_sampling_ratio/max": 1.0768387913703918, "sampling/importance_sampling_ratio/mean": 0.2788920998573303, "sampling/importance_sampling_ratio/min": 3.803688423431595e-05, "sampling/sampling_logp_difference/max": 2.494473457336426, "sampling/sampling_logp_difference/mean": 0.004690306028351188, "step": 6640, "step_time": 7.151875993050635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3020833333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2012.6666666666667, "completions/mean_length": 1265.5, "completions/mean_terminated_length": 515.1297912597656, "completions/min_length": 142.33333333333334, "completions/min_terminated_length": 142.33333333333334, "entropy": 0.024968463554978372, "epoch": 0.7992788461538461, "frac_reward_zero_std": 0.0, "grad_norm": 0.005247250199317932, "learning_rate": 2.0084134615384615e-07, "loss": -0.002, "num_tokens": 134131603.0, "reward": 0.7064063549041748, "reward_std": 0.25836629668871564, "rewards/reward_fn/mean": 0.7064063549041748, "rewards/reward_fn/std": 0.25836629172166187, "sampling/importance_sampling_ratio/max": 1.313445468743642, "sampling/importance_sampling_ratio/mean": 0.3133295675118764, "sampling/importance_sampling_ratio/min": 3.477730957020716e-05, "sampling/sampling_logp_difference/max": 4.655890464782715, "sampling/sampling_logp_difference/mean": 0.005611242571224769, "step": 6650, "step_time": 10.37171946791932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2923.5, "completions/mean_length": 1291.734375, "completions/mean_terminated_length": 684.8223571777344, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.020463802479207517, "epoch": 0.8004807692307693, "frac_reward_zero_std": 0.0, "grad_norm": 0.03816848620772362, "learning_rate": 1.996394230769231e-07, "loss": -0.0071, "num_tokens": 134283586.0, "reward": 0.7383715510368347, "reward_std": 0.2599829211831093, "rewards/reward_fn/mean": 0.7383715510368347, "rewards/reward_fn/std": 0.2599829137325287, "sampling/importance_sampling_ratio/max": 1.3908791542053223, "sampling/importance_sampling_ratio/mean": 0.38776957988739014, "sampling/importance_sampling_ratio/min": 0.00045306324318517, "sampling/sampling_logp_difference/max": 1.9005358815193176, "sampling/sampling_logp_difference/mean": 0.004748164676129818, "step": 6660, "step_time": 7.151619357988238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13541666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2058.3333333333335, "completions/mean_length": 796.875, "completions/mean_terminated_length": 456.32940673828125, "completions/min_length": 136.66666666666666, "completions/min_terminated_length": 136.66666666666666, "entropy": 0.026962605118751527, "epoch": 0.8016826923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.018853724002838135, "learning_rate": 1.9843749999999998e-07, "loss": -0.0011, "num_tokens": 134467070.0, "reward": 0.7623232007026672, "reward_std": 0.2628633677959442, "rewards/reward_fn/mean": 0.7623232007026672, "rewards/reward_fn/std": 0.2628633677959442, "sampling/importance_sampling_ratio/max": 2.141475955645243, "sampling/importance_sampling_ratio/mean": 0.4394506017367045, "sampling/importance_sampling_ratio/min": 0.0003013072619069135, "sampling/sampling_logp_difference/max": 1.575763424237569, "sampling/sampling_logp_difference/mean": 0.0056268506062527495, "step": 6670, "step_time": 10.066004116274417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1766.0, "completions/mean_length": 827.15625, "completions/mean_terminated_length": 421.77919006347656, "completions/min_length": 117.5, "completions/min_terminated_length": 117.5, "entropy": 0.020957073755562307, "epoch": 0.8028846153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.04279366508126259, "learning_rate": 1.972355769230769e-07, "loss": -0.0001, "num_tokens": 134589448.0, "reward": 0.7761657238006592, "reward_std": 0.22822744399309158, "rewards/reward_fn/mean": 0.7761657238006592, "rewards/reward_fn/std": 0.22822745144367218, "sampling/importance_sampling_ratio/max": 1.4967018365859985, "sampling/importance_sampling_ratio/mean": 0.42840996384620667, "sampling/importance_sampling_ratio/min": 0.0012986955189262517, "sampling/sampling_logp_difference/max": 2.0398924350738525, "sampling/sampling_logp_difference/mean": 0.005450173746794462, "step": 6680, "step_time": 6.706834557373076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 1230.3333333333333, "completions/mean_terminated_length": 556.4324340820312, "completions/min_length": 120.33333333333333, "completions/min_terminated_length": 120.33333333333333, "entropy": 0.027551425248384477, "epoch": 0.8040865384615384, "frac_reward_zero_std": 0.0, "grad_norm": 0.005041729658842087, "learning_rate": 1.9603365384615384e-07, "loss": -0.0045, "num_tokens": 134836272.0, "reward": 0.7281659841537476, "reward_std": 0.24357961614926657, "rewards/reward_fn/mean": 0.7281659841537476, "rewards/reward_fn/std": 0.24357960124810538, "sampling/importance_sampling_ratio/max": 1.3613827625910442, "sampling/importance_sampling_ratio/mean": 0.3032987713813782, "sampling/importance_sampling_ratio/min": 6.621192975823457e-06, "sampling/sampling_logp_difference/max": 2.669799566268921, "sampling/sampling_logp_difference/mean": 0.0055861487829436856, "step": 6690, "step_time": 10.386154147516937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2041.5, "completions/mean_length": 729.390625, "completions/mean_terminated_length": 536.6534729003906, "completions/min_length": 155.5, "completions/min_terminated_length": 155.5, "entropy": 0.02263471782207489, "epoch": 0.8052884615384616, "frac_reward_zero_std": 0.125, "grad_norm": 0.01128763984888792, "learning_rate": 1.9483173076923075e-07, "loss": 0.0029, "num_tokens": 134946881.0, "reward": 0.739230215549469, "reward_std": 0.29170212149620056, "rewards/reward_fn/mean": 0.739230215549469, "rewards/reward_fn/std": 0.29170212149620056, "sampling/importance_sampling_ratio/max": 2.1679192781448364, "sampling/importance_sampling_ratio/mean": 0.4871691167354584, "sampling/importance_sampling_ratio/min": 2.513936851755716e-05, "sampling/sampling_logp_difference/max": 1.744110107421875, "sampling/sampling_logp_difference/mean": 0.005567749962210655, "step": 6700, "step_time": 6.87407914698124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.17708333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 950.7083333333334, "completions/mean_terminated_length": 508.9241027832031, "completions/min_length": 135.33333333333334, "completions/min_terminated_length": 135.33333333333334, "entropy": 0.02854470405727625, "epoch": 0.8064903846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.015094408765435219, "learning_rate": 1.936298076923077e-07, "loss": -0.0108, "num_tokens": 135146069.0, "reward": 0.7727883458137512, "reward_std": 0.24273759126663208, "rewards/reward_fn/mean": 0.7727883458137512, "rewards/reward_fn/std": 0.24273758629957834, "sampling/importance_sampling_ratio/max": 1.6713361740112305, "sampling/importance_sampling_ratio/mean": 0.41877217094103497, "sampling/importance_sampling_ratio/min": 0.0002391600727757274, "sampling/sampling_logp_difference/max": 2.4932690858840942, "sampling/sampling_logp_difference/mean": 0.005927203843990962, "step": 6710, "step_time": 10.333343376405537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1060.5, "completions/mean_length": 1647.53125, "completions/mean_terminated_length": 364.0281982421875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.023452695831656455, "epoch": 0.8076923076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.00853606779128313, "learning_rate": 1.9242788461538461e-07, "loss": -0.0018, "num_tokens": 135325231.0, "reward": 0.6373376250267029, "reward_std": 0.24664133042097092, "rewards/reward_fn/mean": 0.6373376250267029, "rewards/reward_fn/std": 0.24664131551980972, "sampling/importance_sampling_ratio/max": 1.1374436020851135, "sampling/importance_sampling_ratio/mean": 0.2599187344312668, "sampling/importance_sampling_ratio/min": 1.1985082750687326e-06, "sampling/sampling_logp_difference/max": 6.507355451583862, "sampling/sampling_logp_difference/mean": 0.0053941921796649694, "step": 6720, "step_time": 7.194619513861835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 2016.6666666666667, "completions/mean_length": 1167.3125, "completions/mean_terminated_length": 540.5662943522135, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.023271008860319853, "epoch": 0.8088942307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.023146916180849075, "learning_rate": 1.912259615384615e-07, "loss": 0.0124, "num_tokens": 135536949.0, "reward": 0.7320713599522909, "reward_std": 0.25613416234652203, "rewards/reward_fn/mean": 0.7320713599522909, "rewards/reward_fn/std": 0.25613416234652203, "sampling/importance_sampling_ratio/max": 1.347498854001363, "sampling/importance_sampling_ratio/mean": 0.36987149715423584, "sampling/importance_sampling_ratio/min": 5.434205559140537e-05, "sampling/sampling_logp_difference/max": 1.7095420360565186, "sampling/sampling_logp_difference/mean": 0.005366539272169272, "step": 6730, "step_time": 10.322194720897823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1289.5, "completions/mean_length": 953.640625, "completions/mean_terminated_length": 336.3333435058594, "completions/min_length": 88.5, "completions/min_terminated_length": 88.5, "entropy": 0.02005878584459424, "epoch": 0.8100961538461539, "frac_reward_zero_std": 0.25, "grad_norm": 0.005600991193205118, "learning_rate": 1.9002403846153845e-07, "loss": -0.0085, "num_tokens": 135657510.0, "reward": 0.6599214375019073, "reward_std": 0.33182238042354584, "rewards/reward_fn/mean": 0.6599214375019073, "rewards/reward_fn/std": 0.33182236552238464, "sampling/importance_sampling_ratio/max": 1.8017452359199524, "sampling/importance_sampling_ratio/mean": 0.5183573663234711, "sampling/importance_sampling_ratio/min": 0.00032685676160326693, "sampling/sampling_logp_difference/max": 3.849126935005188, "sampling/sampling_logp_difference/mean": 0.004470109357498586, "step": 6740, "step_time": 6.652507496532053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 2211.6666666666665, "completions/mean_length": 1146.5, "completions/mean_terminated_length": 538.1156819661459, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.02336325952783227, "epoch": 0.8112980769230769, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.018128175288438797, "learning_rate": 1.8882211538461536e-07, "loss": -0.0055, "num_tokens": 135871158.0, "reward": 0.6665361126263937, "reward_std": 0.2977451632420222, "rewards/reward_fn/mean": 0.6665361126263937, "rewards/reward_fn/std": 0.29774515827496845, "sampling/importance_sampling_ratio/max": 2.409237782160441, "sampling/importance_sampling_ratio/mean": 0.433979332447052, "sampling/importance_sampling_ratio/min": 9.19198840468501e-05, "sampling/sampling_logp_difference/max": 2.087180256843567, "sampling/sampling_logp_difference/mean": 0.005216107005253434, "step": 6750, "step_time": 10.322908541280777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2158.5, "completions/mean_length": 962.765625, "completions/mean_terminated_length": 593.4042053222656, "completions/min_length": 141.5, "completions/min_terminated_length": 141.5, "entropy": 0.025652704946696758, "epoch": 0.8125, "frac_reward_zero_std": 0.0, "grad_norm": 0.02165539376437664, "learning_rate": 1.876201923076923e-07, "loss": 0.0022, "num_tokens": 136004191.0, "reward": 0.8226109147071838, "reward_std": 0.17708273977041245, "rewards/reward_fn/mean": 0.8226109147071838, "rewards/reward_fn/std": 0.17708273977041245, "sampling/importance_sampling_ratio/max": 1.161408007144928, "sampling/importance_sampling_ratio/mean": 0.32737742364406586, "sampling/importance_sampling_ratio/min": 2.575672624516301e-05, "sampling/sampling_logp_difference/max": 1.8987823724746704, "sampling/sampling_logp_difference/mean": 0.006623285822570324, "step": 6760, "step_time": 6.695540511142463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2044.3333333333333, "completions/mean_length": 1363.9479166666667, "completions/mean_terminated_length": 743.0386352539062, "completions/min_length": 224.33333333333334, "completions/min_terminated_length": 224.33333333333334, "entropy": 0.020955198630690574, "epoch": 0.8137019230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.013116489164531231, "learning_rate": 1.8641826923076922e-07, "loss": -0.0003, "num_tokens": 136227850.0, "reward": 0.7286752859751383, "reward_std": 0.2610810150702794, "rewards/reward_fn/mean": 0.7286752859751383, "rewards/reward_fn/std": 0.2610810250043869, "sampling/importance_sampling_ratio/max": 1.3731770515441895, "sampling/importance_sampling_ratio/mean": 0.31190918385982513, "sampling/importance_sampling_ratio/min": 0.0003928329169866629, "sampling/sampling_logp_difference/max": 2.247061332066854, "sampling/sampling_logp_difference/mean": 0.004957205771158139, "step": 6770, "step_time": 10.280849102605135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1792.0, "completions/mean_length": 878.25, "completions/mean_terminated_length": 495.34344482421875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.024243224784731864, "epoch": 0.8149038461538461, "frac_reward_zero_std": 0.0, "grad_norm": 0.010213176719844341, "learning_rate": 1.8521634615384616e-07, "loss": -0.0142, "num_tokens": 136365306.0, "reward": 0.8334711492061615, "reward_std": 0.1803877204656601, "rewards/reward_fn/mean": 0.8334711492061615, "rewards/reward_fn/std": 0.1803877204656601, "sampling/importance_sampling_ratio/max": 1.0701108872890472, "sampling/importance_sampling_ratio/mean": 0.3711470067501068, "sampling/importance_sampling_ratio/min": 0.00018633904574016924, "sampling/sampling_logp_difference/max": 2.2633256316184998, "sampling/sampling_logp_difference/mean": 0.0061378781683743, "step": 6780, "step_time": 6.7472599891014395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.22916666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 1937.0, "completions/mean_length": 1146.5625, "completions/mean_terminated_length": 599.0428568522135, "completions/min_length": 154.33333333333334, "completions/min_terminated_length": 154.33333333333334, "entropy": 0.018343249429017307, "epoch": 0.8161057692307693, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.012544178403913975, "learning_rate": 1.8401442307692308e-07, "loss": 0.0112, "num_tokens": 136587880.0, "reward": 0.7414333820343018, "reward_std": 0.2510773589213689, "rewards/reward_fn/mean": 0.7414333820343018, "rewards/reward_fn/std": 0.2510773589213689, "sampling/importance_sampling_ratio/max": 2.2635221083958945, "sampling/importance_sampling_ratio/mean": 0.43860403696695965, "sampling/importance_sampling_ratio/min": 3.4894061236911966e-05, "sampling/sampling_logp_difference/max": 5.207434733708699, "sampling/sampling_logp_difference/mean": 0.004392832672844331, "step": 6790, "step_time": 10.349603976123035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2502.0, "completions/mean_length": 1523.21875, "completions/mean_terminated_length": 662.0204772949219, "completions/min_length": 161.5, "completions/min_terminated_length": 161.5, "entropy": 0.024387188255786896, "epoch": 0.8173076923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.002327181864529848, "learning_rate": 1.8281249999999997e-07, "loss": 0.0044, "num_tokens": 136785878.0, "reward": 0.691668838262558, "reward_std": 0.26011117547750473, "rewards/reward_fn/mean": 0.691668838262558, "rewards/reward_fn/std": 0.2601111903786659, "sampling/importance_sampling_ratio/max": 1.1970969140529633, "sampling/importance_sampling_ratio/mean": 0.2538820803165436, "sampling/importance_sampling_ratio/min": 1.58010897166605e-06, "sampling/sampling_logp_difference/max": 4.8477301597595215, "sampling/sampling_logp_difference/mean": 0.005209984956309199, "step": 6800, "step_time": 7.5511665438301865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19791666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2094.0, "completions/mean_length": 1029.90625, "completions/mean_terminated_length": 547.8228251139323, "completions/min_length": 150.66666666666666, "completions/min_terminated_length": 150.66666666666666, "entropy": 0.023898510076105595, "epoch": 0.8185096153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.013646990992128849, "learning_rate": 1.816105769230769e-07, "loss": 0.0034, "num_tokens": 136974645.0, "reward": 0.8230737447738647, "reward_std": 0.18862813711166382, "rewards/reward_fn/mean": 0.8230737447738647, "rewards/reward_fn/std": 0.18862813214461008, "sampling/importance_sampling_ratio/max": 1.3926626443862915, "sampling/importance_sampling_ratio/mean": 0.36493950088818866, "sampling/importance_sampling_ratio/min": 0.00023679643443941245, "sampling/sampling_logp_difference/max": 3.4843742847442627, "sampling/sampling_logp_difference/mean": 0.005413846578449011, "step": 6810, "step_time": 9.97635239744559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1794.0, "completions/mean_length": 1462.796875, "completions/mean_terminated_length": 540.4750061035156, "completions/min_length": 168.5, "completions/min_terminated_length": 168.5, "entropy": 0.024325793236494066, "epoch": 0.8197115384615384, "frac_reward_zero_std": 0.0, "grad_norm": 0.008669666014611721, "learning_rate": 1.8040865384615383e-07, "loss": 0.0031, "num_tokens": 137153832.0, "reward": 0.6950986683368683, "reward_std": 0.26122091710567474, "rewards/reward_fn/mean": 0.6950986683368683, "rewards/reward_fn/std": 0.26122093200683594, "sampling/importance_sampling_ratio/max": 1.6436331868171692, "sampling/importance_sampling_ratio/mean": 0.3580250144004822, "sampling/importance_sampling_ratio/min": 1.54410545292194e-05, "sampling/sampling_logp_difference/max": 2.6943869590759277, "sampling/sampling_logp_difference/mean": 0.005149191478267312, "step": 6820, "step_time": 7.16273885127157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23958333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1557.0, "completions/mean_length": 1188.3854166666667, "completions/mean_terminated_length": 634.2962951660156, "completions/min_length": 144.66666666666666, "completions/min_terminated_length": 144.66666666666666, "entropy": 0.02751188203692436, "epoch": 0.8209134615384616, "frac_reward_zero_std": 0.0, "grad_norm": 0.016119293868541718, "learning_rate": 1.7920673076923077e-07, "loss": 0.0005, "num_tokens": 137375061.0, "reward": 0.7656160394350687, "reward_std": 0.23724613090356192, "rewards/reward_fn/mean": 0.7656160394350687, "rewards/reward_fn/std": 0.23724614083766937, "sampling/importance_sampling_ratio/max": 1.034310241540273, "sampling/importance_sampling_ratio/mean": 0.27879906197388965, "sampling/importance_sampling_ratio/min": 2.885842347192617e-05, "sampling/sampling_logp_difference/max": 4.896727720896403, "sampling/sampling_logp_difference/mean": 0.0058268290013074875, "step": 6830, "step_time": 10.2054557191208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2032.5, "completions/mean_length": 1340.453125, "completions/mean_terminated_length": 644.3083038330078, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.02663444373756647, "epoch": 0.8221153846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.01960519701242447, "learning_rate": 1.780048076923077e-07, "loss": -0.0051, "num_tokens": 137530506.0, "reward": 0.727553516626358, "reward_std": 0.2524624839425087, "rewards/reward_fn/mean": 0.727553516626358, "rewards/reward_fn/std": 0.2524624839425087, "sampling/importance_sampling_ratio/max": 1.4836518168449402, "sampling/importance_sampling_ratio/mean": 0.2200494185090065, "sampling/importance_sampling_ratio/min": 0.0001850717330853513, "sampling/sampling_logp_difference/max": 2.94562029838562, "sampling/sampling_logp_difference/mean": 0.005769022507593036, "step": 6840, "step_time": 7.088755497615784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2064.6666666666665, "completions/mean_length": 1046.6458333333333, "completions/mean_terminated_length": 494.7965596516927, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.020814773440361024, "epoch": 0.8233173076923077, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.008253254927694798, "learning_rate": 1.768028846153846e-07, "loss": 0.0001, "num_tokens": 137731344.0, "reward": 0.7606135408083597, "reward_std": 0.23464014132817587, "rewards/reward_fn/mean": 0.7606135408083597, "rewards/reward_fn/std": 0.23464013636112213, "sampling/importance_sampling_ratio/max": 1.1133031249046326, "sampling/importance_sampling_ratio/mean": 0.3920016884803772, "sampling/importance_sampling_ratio/min": 6.661914354329686e-05, "sampling/sampling_logp_difference/max": 2.384853482246399, "sampling/sampling_logp_difference/mean": 0.004817136563360691, "step": 6850, "step_time": 10.112236562371255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2164.5, "completions/mean_length": 793.65625, "completions/mean_terminated_length": 391.33660888671875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.016952751204371452, "epoch": 0.8245192307692307, "frac_reward_zero_std": 0.375, "grad_norm": 0.004784613382071257, "learning_rate": 1.7560096153846152e-07, "loss": 0.0122, "num_tokens": 137844130.0, "reward": 0.7072263360023499, "reward_std": 0.3219653591513634, "rewards/reward_fn/mean": 0.7072263360023499, "rewards/reward_fn/std": 0.3219653517007828, "sampling/importance_sampling_ratio/max": 1.1803658306598663, "sampling/importance_sampling_ratio/mean": 0.5666937530040741, "sampling/importance_sampling_ratio/min": 0.002043343130253561, "sampling/sampling_logp_difference/max": 1.447906732559204, "sampling/sampling_logp_difference/mean": 0.003553066519089043, "step": 6860, "step_time": 6.650548468995839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 1331.6666666666667, "completions/mean_length": 1051.15625, "completions/mean_terminated_length": 447.5577341715495, "completions/min_length": 131.33333333333334, "completions/min_terminated_length": 131.33333333333334, "entropy": 0.02375180572271347, "epoch": 0.8257211538461539, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.005059670191258192, "learning_rate": 1.7439903846153844e-07, "loss": -0.0103, "num_tokens": 138049305.0, "reward": 0.6987811923027039, "reward_std": 0.25697227815787, "rewards/reward_fn/mean": 0.6987811923027039, "rewards/reward_fn/std": 0.25697225828965503, "sampling/importance_sampling_ratio/max": 1.8562357823053997, "sampling/importance_sampling_ratio/mean": 0.5156234701474508, "sampling/importance_sampling_ratio/min": 0.00012483605719163884, "sampling/sampling_logp_difference/max": 3.175388594468435, "sampling/sampling_logp_difference/mean": 0.005593536266436179, "step": 6870, "step_time": 10.081178573332727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2530.0, "completions/mean_length": 1724.53125, "completions/mean_terminated_length": 851.8999938964844, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.022577947936952114, "epoch": 0.8269230769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.0023898181971162558, "learning_rate": 1.7319711538461538e-07, "loss": -0.007, "num_tokens": 138248619.0, "reward": 0.5689750611782074, "reward_std": 0.32699495553970337, "rewards/reward_fn/mean": 0.5689750611782074, "rewards/reward_fn/std": 0.32699498534202576, "sampling/importance_sampling_ratio/max": 1.0864353775978088, "sampling/importance_sampling_ratio/mean": 0.24821648001670837, "sampling/importance_sampling_ratio/min": 7.968683917169983e-07, "sampling/sampling_logp_difference/max": 26.858050525188446, "sampling/sampling_logp_difference/mean": 0.005364488577470183, "step": 6880, "step_time": 7.245295461639762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 2512.0, "completions/mean_length": 1238.71875, "completions/mean_terminated_length": 651.5307210286459, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.022375798597931863, "epoch": 0.828125, "frac_reward_zero_std": 0.0, "grad_norm": 0.006994019728153944, "learning_rate": 1.719951923076923e-07, "loss": -0.0042, "num_tokens": 138470432.0, "reward": 0.7616462707519531, "reward_std": 0.2303058256705602, "rewards/reward_fn/mean": 0.7616462707519531, "rewards/reward_fn/std": 0.2303058256705602, "sampling/importance_sampling_ratio/max": 1.3428867657979329, "sampling/importance_sampling_ratio/mean": 0.35622599720954895, "sampling/importance_sampling_ratio/min": 6.357276087480083e-06, "sampling/sampling_logp_difference/max": 3.0581253369649253, "sampling/sampling_logp_difference/mean": 0.005328538827598095, "step": 6890, "step_time": 10.342494682688265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1337.390625, "completions/mean_terminated_length": 523.0924072265625, "completions/min_length": 108.5, "completions/min_terminated_length": 108.5, "entropy": 0.021644866839051247, "epoch": 0.8293269230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.009319664910435677, "learning_rate": 1.7079326923076924e-07, "loss": -0.007, "num_tokens": 138634153.0, "reward": 0.7377616167068481, "reward_std": 0.26367828249931335, "rewards/reward_fn/mean": 0.7377616167068481, "rewards/reward_fn/std": 0.26367826759815216, "sampling/importance_sampling_ratio/max": 1.2890145182609558, "sampling/importance_sampling_ratio/mean": 0.35057297348976135, "sampling/importance_sampling_ratio/min": 4.07967218052363e-06, "sampling/sampling_logp_difference/max": 4.03126323223114, "sampling/sampling_logp_difference/mean": 0.005212919786572456, "step": 6900, "step_time": 7.0115896061062815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 1899.6666666666667, "completions/mean_length": 882.875, "completions/mean_terminated_length": 455.18092854817706, "completions/min_length": 134.33333333333334, "completions/min_terminated_length": 134.33333333333334, "entropy": 0.023920848779380322, "epoch": 0.8305288461538461, "frac_reward_zero_std": 0.0, "grad_norm": 0.004946565721184015, "learning_rate": 1.6959134615384615e-07, "loss": -0.0045, "num_tokens": 138822269.0, "reward": 0.7674394249916077, "reward_std": 0.2300974577665329, "rewards/reward_fn/mean": 0.7674394249916077, "rewards/reward_fn/std": 0.2300974577665329, "sampling/importance_sampling_ratio/max": 1.7507988611857097, "sampling/importance_sampling_ratio/mean": 0.46347623070081073, "sampling/importance_sampling_ratio/min": 0.00013760670359867314, "sampling/sampling_logp_difference/max": 2.038575013478597, "sampling/sampling_logp_difference/mean": 0.005629631069799264, "step": 6910, "step_time": 10.299252491537482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1275.0, "completions/mean_length": 969.84375, "completions/mean_terminated_length": 329.3250045776367, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.024819427728652955, "epoch": 0.8317307692307693, "frac_reward_zero_std": 0.0, "grad_norm": 0.005877587012946606, "learning_rate": 1.6838942307692307e-07, "loss": 0.0044, "num_tokens": 138949507.0, "reward": 0.6930743753910065, "reward_std": 0.27168963849544525, "rewards/reward_fn/mean": 0.6930743753910065, "rewards/reward_fn/std": 0.27168963849544525, "sampling/importance_sampling_ratio/max": 1.7573692798614502, "sampling/importance_sampling_ratio/mean": 0.517643392086029, "sampling/importance_sampling_ratio/min": 0.012264674480320537, "sampling/sampling_logp_difference/max": 2.1143829822540283, "sampling/sampling_logp_difference/mean": 0.0047837437596172094, "step": 6920, "step_time": 6.943385070934892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1842.6666666666667, "completions/mean_length": 802.5729166666666, "completions/mean_terminated_length": 488.7283020019531, "completions/min_length": 125.33333333333333, "completions/min_terminated_length": 125.33333333333333, "entropy": 0.027300281263887883, "epoch": 0.8329326923076923, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.009884782135486603, "learning_rate": 1.671875e-07, "loss": -0.0017, "num_tokens": 139130162.0, "reward": 0.7242677410443624, "reward_std": 0.28134160737196606, "rewards/reward_fn/mean": 0.7242677410443624, "rewards/reward_fn/std": 0.2813415974378586, "sampling/importance_sampling_ratio/max": 1.1730528871218364, "sampling/importance_sampling_ratio/mean": 0.36732662717501324, "sampling/importance_sampling_ratio/min": 0.00010661744924315523, "sampling/sampling_logp_difference/max": 1.8055496215820312, "sampling/sampling_logp_difference/mean": 0.005623308165619771, "step": 6930, "step_time": 10.07025023745373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 1190.109375, "completions/mean_terminated_length": 593.8129425048828, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.026267390139400958, "epoch": 0.8341346153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.015394563786685467, "learning_rate": 1.659855769230769e-07, "loss": -0.0056, "num_tokens": 139270273.0, "reward": 0.7244434952735901, "reward_std": 0.2563028857111931, "rewards/reward_fn/mean": 0.7244434952735901, "rewards/reward_fn/std": 0.2563028782606125, "sampling/importance_sampling_ratio/max": 1.2111260294914246, "sampling/importance_sampling_ratio/mean": 0.27310119569301605, "sampling/importance_sampling_ratio/min": 2.417215091554681e-05, "sampling/sampling_logp_difference/max": 2.3204939365386963, "sampling/sampling_logp_difference/mean": 0.005472195567563176, "step": 6940, "step_time": 7.057494845055044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2604166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2478.6666666666665, "completions/mean_length": 1257.3541666666667, "completions/mean_terminated_length": 640.6973266601562, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.026158127188682555, "epoch": 0.8353365384615384, "frac_reward_zero_std": 0.0, "grad_norm": 0.003256229916587472, "learning_rate": 1.6478365384615385e-07, "loss": -0.0032, "num_tokens": 139500483.0, "reward": 0.7464001377423605, "reward_std": 0.23952084283034006, "rewards/reward_fn/mean": 0.7464001377423605, "rewards/reward_fn/std": 0.23952083786328635, "sampling/importance_sampling_ratio/max": 1.302933931350708, "sampling/importance_sampling_ratio/mean": 0.2916339288155238, "sampling/importance_sampling_ratio/min": 0.00010241438862597836, "sampling/sampling_logp_difference/max": 3.654414256413778, "sampling/sampling_logp_difference/mean": 0.005411717730263869, "step": 6950, "step_time": 10.252696036733687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 1181.515625, "completions/mean_terminated_length": 569.2196044921875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.025300581753253937, "epoch": 0.8365384615384616, "frac_reward_zero_std": 0.0, "grad_norm": 0.006637209095060825, "learning_rate": 1.6358173076923076e-07, "loss": 0.0029, "num_tokens": 139651348.0, "reward": 0.7415344417095184, "reward_std": 0.24985411763191223, "rewards/reward_fn/mean": 0.7415344417095184, "rewards/reward_fn/std": 0.24985411018133163, "sampling/importance_sampling_ratio/max": 1.548153042793274, "sampling/importance_sampling_ratio/mean": 0.33220648765563965, "sampling/importance_sampling_ratio/min": 4.534776780928951e-05, "sampling/sampling_logp_difference/max": 2.9344053864479065, "sampling/sampling_logp_difference/mean": 0.005746126174926758, "step": 6960, "step_time": 7.08162454944104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2146.3333333333335, "completions/mean_length": 1184.9791666666667, "completions/mean_terminated_length": 678.0535888671875, "completions/min_length": 148.33333333333334, "completions/min_terminated_length": 148.33333333333334, "entropy": 0.027664628624916077, "epoch": 0.8377403846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.01199318841099739, "learning_rate": 1.6237980769230768e-07, "loss": 0.0005, "num_tokens": 139871906.0, "reward": 0.7263395587603251, "reward_std": 0.26289120813210803, "rewards/reward_fn/mean": 0.7263395587603251, "rewards/reward_fn/std": 0.26289119323094684, "sampling/importance_sampling_ratio/max": 1.254350185394287, "sampling/importance_sampling_ratio/mean": 0.27342327932516736, "sampling/importance_sampling_ratio/min": 0.0003859160441000616, "sampling/sampling_logp_difference/max": 2.4300938844680786, "sampling/sampling_logp_difference/mean": 0.00588711987559994, "step": 6970, "step_time": 10.319141559116542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 3000.0, "completions/max_terminated_length": 783.5, "completions/mean_length": 707.390625, "completions/mean_terminated_length": 303.60099029541016, "completions/min_length": 83.5, "completions/min_terminated_length": 83.5, "entropy": 0.017835280671715737, "epoch": 0.8389423076923077, "frac_reward_zero_std": 0.25, "grad_norm": 0.03299025073647499, "learning_rate": 1.6117788461538462e-07, "loss": -0.032, "num_tokens": 139978795.0, "reward": 0.5013408660888672, "reward_std": 0.36606980860233307, "rewards/reward_fn/mean": 0.5013408660888672, "rewards/reward_fn/std": 0.36606980860233307, "sampling/importance_sampling_ratio/max": 2.038869023323059, "sampling/importance_sampling_ratio/mean": 0.6512362062931061, "sampling/importance_sampling_ratio/min": 0.0022410416931961663, "sampling/sampling_logp_difference/max": 2.2337872982025146, "sampling/sampling_logp_difference/mean": 0.003925522090867162, "step": 6980, "step_time": 6.711357829347253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1922.3333333333333, "completions/mean_length": 713.8541666666666, "completions/mean_terminated_length": 391.337158203125, "completions/min_length": 125.66666666666667, "completions/min_terminated_length": 125.66666666666667, "entropy": 0.02294178996235132, "epoch": 0.8401442307692307, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.009418715722858906, "learning_rate": 1.599759615384615e-07, "loss": -0.0045, "num_tokens": 140157933.0, "reward": 0.7289254466692606, "reward_std": 0.27327637871106464, "rewards/reward_fn/mean": 0.7289254466692606, "rewards/reward_fn/std": 0.2732763687769572, "sampling/importance_sampling_ratio/max": 1.450897177060445, "sampling/importance_sampling_ratio/mean": 0.5224303901195526, "sampling/importance_sampling_ratio/min": 6.211037073929522e-05, "sampling/sampling_logp_difference/max": 1.4959852695465088, "sampling/sampling_logp_difference/mean": 0.0052755239109198255, "step": 6990, "step_time": 9.890141496807336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2598.0, "completions/mean_length": 1411.28125, "completions/mean_terminated_length": 847.5199890136719, "completions/min_length": 172.5, "completions/min_terminated_length": 172.5, "entropy": 0.02729437258094549, "epoch": 0.8413461538461539, "frac_reward_zero_std": 0.0, "grad_norm": 0.006498276721686125, "learning_rate": 1.5877403846153845e-07, "loss": -0.0044, "num_tokens": 140326767.0, "reward": 0.71929931640625, "reward_std": 0.27038730680942535, "rewards/reward_fn/mean": 0.71929931640625, "rewards/reward_fn/std": 0.27038729935884476, "sampling/importance_sampling_ratio/max": 0.9340122044086456, "sampling/importance_sampling_ratio/mean": 0.1891511119902134, "sampling/importance_sampling_ratio/min": 0.0001323352796589461, "sampling/sampling_logp_difference/max": 1.8218629360198975, "sampling/sampling_logp_difference/mean": 0.006082174135372043, "step": 7000, "step_time": 7.2892820465378465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13541666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2185.6666666666665, "completions/mean_length": 797.15625, "completions/mean_terminated_length": 460.5266825358073, "completions/min_length": 134.66666666666666, "completions/min_terminated_length": 134.66666666666666, "entropy": 0.020452067628502844, "epoch": 0.8425480769230769, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.02753128856420517, "learning_rate": 1.5757211538461537e-07, "loss": 0.0231, "num_tokens": 140558150.0, "reward": 0.7704573273658752, "reward_std": 0.2502277195453644, "rewards/reward_fn/mean": 0.7704573273658752, "rewards/reward_fn/std": 0.2502277195453644, "sampling/importance_sampling_ratio/max": 1.910150686899821, "sampling/importance_sampling_ratio/mean": 0.5112549463907877, "sampling/importance_sampling_ratio/min": 0.002093723393045366, "sampling/sampling_logp_difference/max": 2.0019710858662925, "sampling/sampling_logp_difference/mean": 0.005194158138086398, "step": 7010, "step_time": 10.720135370735079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2364.0, "completions/mean_length": 1401.015625, "completions/mean_terminated_length": 622.1028137207031, "completions/min_length": 121.5, "completions/min_terminated_length": 121.5, "entropy": 0.023916630260646343, "epoch": 0.84375, "frac_reward_zero_std": 0.0, "grad_norm": 0.007643390446901321, "learning_rate": 1.563701923076923e-07, "loss": -0.0068, "num_tokens": 140724327.0, "reward": 0.7107284665107727, "reward_std": 0.27183811366558075, "rewards/reward_fn/mean": 0.7107284665107727, "rewards/reward_fn/std": 0.27183809876441956, "sampling/importance_sampling_ratio/max": 1.865343689918518, "sampling/importance_sampling_ratio/mean": 0.31995999813079834, "sampling/importance_sampling_ratio/min": 4.910711595584871e-05, "sampling/sampling_logp_difference/max": 2.9729620218276978, "sampling/sampling_logp_difference/mean": 0.005223116837441921, "step": 7020, "step_time": 7.173264542035758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.17708333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2611.6666666666665, "completions/mean_length": 1071.2083333333333, "completions/mean_terminated_length": 663.8977864583334, "completions/min_length": 130.66666666666666, "completions/min_terminated_length": 130.66666666666666, "entropy": 0.0207345437258482, "epoch": 0.8449519230769231, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.006115437485277653, "learning_rate": 1.5516826923076923e-07, "loss": 0.0232, "num_tokens": 140919555.0, "reward": 0.6746413310368856, "reward_std": 0.3237893929084142, "rewards/reward_fn/mean": 0.6746413310368856, "rewards/reward_fn/std": 0.3237893879413605, "sampling/importance_sampling_ratio/max": 1.5096293687820435, "sampling/importance_sampling_ratio/mean": 0.3553849111000697, "sampling/importance_sampling_ratio/min": 0.0002486227313056588, "sampling/sampling_logp_difference/max": 2.073334733645121, "sampling/sampling_logp_difference/mean": 0.00495490524917841, "step": 7030, "step_time": 10.204659219831228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2113.0, "completions/mean_length": 1126.28125, "completions/mean_terminated_length": 449.1750030517578, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.021373669896274806, "epoch": 0.8461538461538461, "frac_reward_zero_std": 0.0, "grad_norm": 0.0028994607273489237, "learning_rate": 1.5396634615384615e-07, "loss": -0.0014, "num_tokens": 141070733.0, "reward": 0.644044429063797, "reward_std": 0.3213326930999756, "rewards/reward_fn/mean": 0.644044429063797, "rewards/reward_fn/std": 0.3213326781988144, "sampling/importance_sampling_ratio/max": 1.6694517731666565, "sampling/importance_sampling_ratio/mean": 0.44352978467941284, "sampling/importance_sampling_ratio/min": 0.000152455385432404, "sampling/sampling_logp_difference/max": 3.0328989028930664, "sampling/sampling_logp_difference/mean": 0.004991092020645738, "step": 7040, "step_time": 7.025052619446069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.17708333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2302.0, "completions/mean_length": 1045.2708333333333, "completions/mean_terminated_length": 623.7833251953125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.02743039298802614, "epoch": 0.8473557692307693, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.008150446228682995, "learning_rate": 1.527644230769231e-07, "loss": -0.0048, "num_tokens": 141284167.0, "reward": 0.7893303036689758, "reward_std": 0.2367378423611323, "rewards/reward_fn/mean": 0.7893303036689758, "rewards/reward_fn/std": 0.23673781752586365, "sampling/importance_sampling_ratio/max": 1.3379480441411336, "sampling/importance_sampling_ratio/mean": 0.3876957098642985, "sampling/importance_sampling_ratio/min": 8.943581148438777e-05, "sampling/sampling_logp_difference/max": 2.104866941769918, "sampling/sampling_logp_difference/mean": 0.005616623908281326, "step": 7050, "step_time": 10.180274306237697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3000.0, "completions/max_terminated_length": 748.5, "completions/mean_length": 593.359375, "completions/mean_terminated_length": 246.173095703125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.017240798100829124, "epoch": 0.8485576923076923, "frac_reward_zero_std": 0.25, "grad_norm": 0.015097892843186855, "learning_rate": 1.5156249999999998e-07, "loss": -0.0031, "num_tokens": 141377646.0, "reward": 0.7395722568035126, "reward_std": 0.28498321026563644, "rewards/reward_fn/mean": 0.7395722568035126, "rewards/reward_fn/std": 0.28498321026563644, "sampling/importance_sampling_ratio/max": 1.4806137084960938, "sampling/importance_sampling_ratio/mean": 0.6586013734340668, "sampling/importance_sampling_ratio/min": 0.0001571948357650399, "sampling/sampling_logp_difference/max": 2.1133057475090027, "sampling/sampling_logp_difference/mean": 0.003638960770331323, "step": 7060, "step_time": 6.769774859119207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3645833333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1922.0, "completions/mean_length": 1508.7916666666667, "completions/mean_terminated_length": 652.7869771321615, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.026028821617364882, "epoch": 0.8497596153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.00721743144094944, "learning_rate": 1.5036057692307692e-07, "loss": -0.0056, "num_tokens": 141621434.0, "reward": 0.703334112962087, "reward_std": 0.2754689007997513, "rewards/reward_fn/mean": 0.703334112962087, "rewards/reward_fn/std": 0.27546889583269757, "sampling/importance_sampling_ratio/max": 1.6062078873316448, "sampling/importance_sampling_ratio/mean": 0.30664120117823285, "sampling/importance_sampling_ratio/min": 2.10207966423089e-05, "sampling/sampling_logp_difference/max": 2.7268288930257163, "sampling/sampling_logp_difference/mean": 0.0052969080085555715, "step": 7070, "step_time": 10.33065646244213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2229.5, "completions/mean_length": 852.65625, "completions/mean_terminated_length": 747.8016052246094, "completions/min_length": 136.5, "completions/min_terminated_length": 136.5, "entropy": 0.022856530267745256, "epoch": 0.8509615384615384, "frac_reward_zero_std": 0.125, "grad_norm": 0.013145776465535164, "learning_rate": 1.4915865384615384e-07, "loss": 0.005, "num_tokens": 141743332.0, "reward": 0.6879273355007172, "reward_std": 0.3046773001551628, "rewards/reward_fn/mean": 0.6879273355007172, "rewards/reward_fn/std": 0.304677277803421, "sampling/importance_sampling_ratio/max": 1.2135862112045288, "sampling/importance_sampling_ratio/mean": 0.3749777227640152, "sampling/importance_sampling_ratio/min": 0.001561797179192581, "sampling/sampling_logp_difference/max": 2.3152116537094116, "sampling/sampling_logp_difference/mean": 0.00523552147205919, "step": 7080, "step_time": 6.884765348955989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23958333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1523.6666666666667, "completions/mean_length": 1041.8229166666667, "completions/mean_terminated_length": 437.5574645996094, "completions/min_length": 118.66666666666667, "completions/min_terminated_length": 118.66666666666667, "entropy": 0.02479841560125351, "epoch": 0.8521634615384616, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.009908604435622692, "learning_rate": 1.4795673076923075e-07, "loss": 0.0013, "num_tokens": 141954523.0, "reward": 0.6527093648910522, "reward_std": 0.29801882803440094, "rewards/reward_fn/mean": 0.6527093648910522, "rewards/reward_fn/std": 0.29801881810029346, "sampling/importance_sampling_ratio/max": 1.7163775364557903, "sampling/importance_sampling_ratio/mean": 0.40628668665885925, "sampling/importance_sampling_ratio/min": 3.1419905667462444e-05, "sampling/sampling_logp_difference/max": 1.9496260086695354, "sampling/sampling_logp_difference/mean": 0.005164495358864467, "step": 7090, "step_time": 10.363675345666707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2156.0, "completions/mean_length": 1373.5625, "completions/mean_terminated_length": 437.3626708984375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.019629417918622494, "epoch": 0.8533653846153846, "frac_reward_zero_std": 0.125, "grad_norm": 0.015616143122315407, "learning_rate": 1.467548076923077e-07, "loss": -0.0039, "num_tokens": 142127055.0, "reward": 0.6480883061885834, "reward_std": 0.3020952343940735, "rewards/reward_fn/mean": 0.6480883061885834, "rewards/reward_fn/std": 0.3020952194929123, "sampling/importance_sampling_ratio/max": 1.0675289034843445, "sampling/importance_sampling_ratio/mean": 0.3725214749574661, "sampling/importance_sampling_ratio/min": 1.3322036124918668e-05, "sampling/sampling_logp_difference/max": 2.7631804943084717, "sampling/sampling_logp_difference/mean": 0.0039114567916840315, "step": 7100, "step_time": 7.150250324886292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2174.3333333333335, "completions/mean_length": 1004.7708333333334, "completions/mean_terminated_length": 443.21045939127606, "completions/min_length": 110.33333333333333, "completions/min_terminated_length": 110.33333333333333, "entropy": 0.02404818367213011, "epoch": 0.8545673076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.027475040405988693, "learning_rate": 1.455528846153846e-07, "loss": 0.0062, "num_tokens": 142333385.0, "reward": 0.7115860184033712, "reward_std": 0.2680665999650955, "rewards/reward_fn/mean": 0.7115860184033712, "rewards/reward_fn/std": 0.26806660493214923, "sampling/importance_sampling_ratio/max": 2.31654687722524, "sampling/importance_sampling_ratio/mean": 0.4543142418066661, "sampling/importance_sampling_ratio/min": 2.4932893817701068e-05, "sampling/sampling_logp_difference/max": 1.779450535774231, "sampling/sampling_logp_difference/mean": 0.005373761213074128, "step": 7110, "step_time": 10.381402310263365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2813.5, "completions/mean_length": 1214.8125, "completions/mean_terminated_length": 517.0056762695312, "completions/min_length": 118.5, "completions/min_terminated_length": 118.5, "entropy": 0.025068867765367032, "epoch": 0.8557692307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.009670455008745193, "learning_rate": 1.4435096153846153e-07, "loss": 0.0014, "num_tokens": 142496973.0, "reward": 0.7383133172988892, "reward_std": 0.24622797220945358, "rewards/reward_fn/mean": 0.7383133172988892, "rewards/reward_fn/std": 0.24622796475887299, "sampling/importance_sampling_ratio/max": 1.0540346503257751, "sampling/importance_sampling_ratio/mean": 0.3261059522628784, "sampling/importance_sampling_ratio/min": 1.194332344311988e-05, "sampling/sampling_logp_difference/max": 1.978637158870697, "sampling/sampling_logp_difference/mean": 0.0056684319861233234, "step": 7120, "step_time": 7.270607881899923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3020833333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1762.6666666666667, "completions/mean_length": 1322.3854166666667, "completions/mean_terminated_length": 617.02880859375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.028121387399733067, "epoch": 0.8569711538461539, "frac_reward_zero_std": 0.0, "grad_norm": 0.026205750182271004, "learning_rate": 1.4314903846153845e-07, "loss": 0.0012, "num_tokens": 142725994.0, "reward": 0.7003626823425293, "reward_std": 0.2662431299686432, "rewards/reward_fn/mean": 0.7003626823425293, "rewards/reward_fn/std": 0.2662431299686432, "sampling/importance_sampling_ratio/max": 1.5517892440160115, "sampling/importance_sampling_ratio/mean": 0.276486208041509, "sampling/importance_sampling_ratio/min": 0.0005315304936364859, "sampling/sampling_logp_difference/max": 1.7582488854726155, "sampling/sampling_logp_difference/mean": 0.005891657279183467, "step": 7130, "step_time": 10.320868784375488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1332.5, "completions/mean_length": 1133.609375, "completions/mean_terminated_length": 454.68182373046875, "completions/min_length": 120.5, "completions/min_terminated_length": 120.5, "entropy": 0.025892592035233974, "epoch": 0.8581730769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.04400388151407242, "learning_rate": 1.4194711538461536e-07, "loss": -0.0062, "num_tokens": 143009097.0, "reward": 0.7383115291595459, "reward_std": 0.26266905665397644, "rewards/reward_fn/mean": 0.7383115291595459, "rewards/reward_fn/std": 0.26266905665397644, "sampling/importance_sampling_ratio/max": 1.7800341546535492, "sampling/importance_sampling_ratio/mean": 0.3348439186811447, "sampling/importance_sampling_ratio/min": 0.00011782642741309246, "sampling/sampling_logp_difference/max": 2.7103887796401978, "sampling/sampling_logp_difference/mean": 0.005683869123458862, "step": 7140, "step_time": 9.325115243438631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2708333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1570.6666666666667, "completions/mean_length": 1080.8645833333333, "completions/mean_terminated_length": 370.75762430826825, "completions/min_length": 116.66666666666667, "completions/min_terminated_length": 116.66666666666667, "entropy": 0.019817824568599464, "epoch": 0.859375, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.014396346174180508, "learning_rate": 1.407451923076923e-07, "loss": 0.0057, "num_tokens": 143219948.0, "reward": 0.7079891363779703, "reward_std": 0.30350034435590106, "rewards/reward_fn/mean": 0.7079891363779703, "rewards/reward_fn/std": 0.30350033442179364, "sampling/importance_sampling_ratio/max": 1.5575803915659587, "sampling/importance_sampling_ratio/mean": 0.5604627231756846, "sampling/importance_sampling_ratio/min": 0.00039309893846469396, "sampling/sampling_logp_difference/max": 1.7176295121510823, "sampling/sampling_logp_difference/mean": 0.004464488010853529, "step": 7150, "step_time": 10.451376772485673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2434.0, "completions/mean_length": 1424.6875, "completions/mean_terminated_length": 599.5238342285156, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.023850650154054166, "epoch": 0.8605769230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.011180352419614792, "learning_rate": 1.3954326923076922e-07, "loss": -0.0003, "num_tokens": 143390504.0, "reward": 0.6721096038818359, "reward_std": 0.2831665277481079, "rewards/reward_fn/mean": 0.6721096038818359, "rewards/reward_fn/std": 0.2831665128469467, "sampling/importance_sampling_ratio/max": 1.9453532099723816, "sampling/importance_sampling_ratio/mean": 0.3666405975818634, "sampling/importance_sampling_ratio/min": 4.491966478781251e-07, "sampling/sampling_logp_difference/max": 2.5860283970832825, "sampling/sampling_logp_difference/mean": 0.004953037481755018, "step": 7160, "step_time": 7.28081453666091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2040.3333333333333, "completions/mean_length": 630.5729166666666, "completions/mean_terminated_length": 384.3497009277344, "completions/min_length": 113.33333333333333, "completions/min_terminated_length": 113.33333333333333, "entropy": 0.026350433006882668, "epoch": 0.8617788461538461, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.016798973083496094, "learning_rate": 1.3834134615384616e-07, "loss": 0.0002, "num_tokens": 143597167.0, "reward": 0.8320521116256714, "reward_std": 0.18711299200852713, "rewards/reward_fn/mean": 0.8320521116256714, "rewards/reward_fn/std": 0.18711300194263458, "sampling/importance_sampling_ratio/max": 1.7719039519627888, "sampling/importance_sampling_ratio/mean": 0.532355546951294, "sampling/importance_sampling_ratio/min": 3.748395829461515e-05, "sampling/sampling_logp_difference/max": 3.4911025365193686, "sampling/sampling_logp_difference/mean": 0.005800487473607063, "step": 7170, "step_time": 10.21147453384474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2018.5, "completions/mean_length": 850.109375, "completions/mean_terminated_length": 454.40248107910156, "completions/min_length": 138.5, "completions/min_terminated_length": 138.5, "entropy": 0.022826461866497994, "epoch": 0.8629807692307693, "frac_reward_zero_std": 0.125, "grad_norm": 0.0070105246268212795, "learning_rate": 1.3713942307692308e-07, "loss": -0.0093, "num_tokens": 143722822.0, "reward": 0.6848182380199432, "reward_std": 0.33193013072013855, "rewards/reward_fn/mean": 0.6848182380199432, "rewards/reward_fn/std": 0.33193013072013855, "sampling/importance_sampling_ratio/max": 1.2320035696029663, "sampling/importance_sampling_ratio/mean": 0.43929511308670044, "sampling/importance_sampling_ratio/min": 3.4024689739453606e-05, "sampling/sampling_logp_difference/max": 5.008056640625, "sampling/sampling_logp_difference/mean": 0.004582610679790378, "step": 7180, "step_time": 6.926064229104668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2209.6666666666665, "completions/mean_length": 952.09375, "completions/mean_terminated_length": 473.71142578125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.023155865259468555, "epoch": 0.8641826923076923, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.005552010145038366, "learning_rate": 1.359375e-07, "loss": -0.0024, "num_tokens": 143918399.0, "reward": 0.7133696873982748, "reward_std": 0.2768748849630356, "rewards/reward_fn/mean": 0.7133696873982748, "rewards/reward_fn/std": 0.2768748849630356, "sampling/importance_sampling_ratio/max": 1.2597923080126445, "sampling/importance_sampling_ratio/mean": 0.4463168978691101, "sampling/importance_sampling_ratio/min": 0.0002556569428027918, "sampling/sampling_logp_difference/max": 2.0260831912358603, "sampling/sampling_logp_difference/mean": 0.005346188632150491, "step": 7190, "step_time": 9.977038684673607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1488.5, "completions/mean_length": 772.5625, "completions/mean_terminated_length": 404.44710540771484, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.020976117067039012, "epoch": 0.8653846153846154, "frac_reward_zero_std": 0.25, "grad_norm": 0.00827870611101389, "learning_rate": 1.347355769230769e-07, "loss": -0.0037, "num_tokens": 144027171.0, "reward": 0.6120414733886719, "reward_std": 0.40142250061035156, "rewards/reward_fn/mean": 0.6120414733886719, "rewards/reward_fn/std": 0.40142250061035156, "sampling/importance_sampling_ratio/max": 1.3726723790168762, "sampling/importance_sampling_ratio/mean": 0.5367662608623505, "sampling/importance_sampling_ratio/min": 9.899815404423862e-05, "sampling/sampling_logp_difference/max": 1.5909104943275452, "sampling/sampling_logp_difference/mean": 0.004545321105979383, "step": 7200, "step_time": 6.61954239429906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2260.6666666666665, "completions/mean_length": 1260.3333333333333, "completions/mean_terminated_length": 572.0771891276041, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.025327102839946748, "epoch": 0.8665865384615384, "frac_reward_zero_std": 0.0, "grad_norm": 0.0041866302490234375, "learning_rate": 1.3353365384615383e-07, "loss": -0.0025, "num_tokens": 144273835.0, "reward": 0.7297109166781107, "reward_std": 0.2555530120929082, "rewards/reward_fn/mean": 0.7297109166781107, "rewards/reward_fn/std": 0.2555530120929082, "sampling/importance_sampling_ratio/max": 1.3716166019439697, "sampling/importance_sampling_ratio/mean": 0.311924546957016, "sampling/importance_sampling_ratio/min": 4.220594765532345e-05, "sampling/sampling_logp_difference/max": 1.795520265897115, "sampling/sampling_logp_difference/mean": 0.005765737189600865, "step": 7210, "step_time": 10.402357399649919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2367.0, "completions/mean_length": 1264.8125, "completions/mean_terminated_length": 585.2173156738281, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.02231349814683199, "epoch": 0.8677884615384616, "frac_reward_zero_std": 0.0, "grad_norm": 0.020502500236034393, "learning_rate": 1.3233173076923077e-07, "loss": -0.0051, "num_tokens": 144420759.0, "reward": 0.6928066909313202, "reward_std": 0.29839497804641724, "rewards/reward_fn/mean": 0.6928066909313202, "rewards/reward_fn/std": 0.29839496314525604, "sampling/importance_sampling_ratio/max": 1.8020574450492859, "sampling/importance_sampling_ratio/mean": 0.42663076519966125, "sampling/importance_sampling_ratio/min": 0.0001277647094468648, "sampling/sampling_logp_difference/max": 2.2671627402305603, "sampling/sampling_logp_difference/mean": 0.0052947900258004665, "step": 7220, "step_time": 6.920958604756743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2708333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2522.6666666666665, "completions/mean_length": 1215.59375, "completions/mean_terminated_length": 558.5038146972656, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.021093549393117427, "epoch": 0.8689903846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.022343173623085022, "learning_rate": 1.311298076923077e-07, "loss": -0.0063, "num_tokens": 144640760.0, "reward": 0.6885436574618021, "reward_std": 0.26473474502563477, "rewards/reward_fn/mean": 0.6885436574618021, "rewards/reward_fn/std": 0.26473474005858105, "sampling/importance_sampling_ratio/max": 1.5448542435963948, "sampling/importance_sampling_ratio/mean": 0.3782164653142293, "sampling/importance_sampling_ratio/min": 2.224972924598963e-05, "sampling/sampling_logp_difference/max": 3.704509695370992, "sampling/sampling_logp_difference/mean": 0.0044762147590518, "step": 7230, "step_time": 10.32482622358948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1671.5, "completions/mean_length": 1161.625, "completions/mean_terminated_length": 425.1383972167969, "completions/min_length": 129.5, "completions/min_terminated_length": 129.5, "entropy": 0.024493516609072685, "epoch": 0.8701923076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.012551619671285152, "learning_rate": 1.2992788461538463e-07, "loss": -0.0036, "num_tokens": 144800304.0, "reward": 0.7131195664405823, "reward_std": 0.27115075290203094, "rewards/reward_fn/mean": 0.7131195664405823, "rewards/reward_fn/std": 0.27115075290203094, "sampling/importance_sampling_ratio/max": 1.2946996092796326, "sampling/importance_sampling_ratio/mean": 0.3128240704536438, "sampling/importance_sampling_ratio/min": 6.882919791451059e-06, "sampling/sampling_logp_difference/max": 5.403037428855896, "sampling/sampling_logp_difference/mean": 0.0059490627609193325, "step": 7240, "step_time": 7.111913559678942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1700.0, "completions/mean_length": 901.5625, "completions/mean_terminated_length": 418.84857177734375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.02367508690804243, "epoch": 0.8713942307692307, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.01318963710218668, "learning_rate": 1.2872596153846152e-07, "loss": -0.0029, "num_tokens": 145008614.0, "reward": 0.6884280840555826, "reward_std": 0.3227936625480652, "rewards/reward_fn/mean": 0.6884280840555826, "rewards/reward_fn/std": 0.3227936426798503, "sampling/importance_sampling_ratio/max": 1.709139625231425, "sampling/importance_sampling_ratio/mean": 0.4389911890029907, "sampling/importance_sampling_ratio/min": 7.779411983695657e-05, "sampling/sampling_logp_difference/max": 1.441325267155965, "sampling/sampling_logp_difference/mean": 0.005705118955423434, "step": 7250, "step_time": 10.036273902095854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2136.5, "completions/mean_length": 1130.265625, "completions/mean_terminated_length": 606.739990234375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.026729826629161835, "epoch": 0.8725961538461539, "frac_reward_zero_std": 0.0, "grad_norm": 0.010414425283670425, "learning_rate": 1.2752403846153844e-07, "loss": -0.0084, "num_tokens": 145151191.0, "reward": 0.7720150947570801, "reward_std": 0.23459161072969437, "rewards/reward_fn/mean": 0.7720150947570801, "rewards/reward_fn/std": 0.23459160327911377, "sampling/importance_sampling_ratio/max": 0.9709917306900024, "sampling/importance_sampling_ratio/mean": 0.26327580213546753, "sampling/importance_sampling_ratio/min": 2.858097377611557e-05, "sampling/sampling_logp_difference/max": 2.4270983934402466, "sampling/sampling_logp_difference/mean": 0.005885370075702667, "step": 7260, "step_time": 7.147565714456141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 1906.6666666666667, "completions/mean_length": 945.375, "completions/mean_terminated_length": 528.7210388183594, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.027827671729028225, "epoch": 0.8737980769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.020693274214863777, "learning_rate": 1.2632211538461538e-07, "loss": 0.0007, "num_tokens": 145338459.0, "reward": 0.8139833609263102, "reward_std": 0.19593979914983115, "rewards/reward_fn/mean": 0.8139833609263102, "rewards/reward_fn/std": 0.19593978424866995, "sampling/importance_sampling_ratio/max": 1.5343926747639973, "sampling/importance_sampling_ratio/mean": 0.38174991806348163, "sampling/importance_sampling_ratio/min": 0.0008506661106366664, "sampling/sampling_logp_difference/max": 3.33055579662323, "sampling/sampling_logp_difference/mean": 0.0056042370075980825, "step": 7270, "step_time": 9.95124967508018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1676.0, "completions/mean_length": 755.359375, "completions/mean_terminated_length": 434.6964416503906, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.026197315007448197, "epoch": 0.875, "frac_reward_zero_std": 0.0, "grad_norm": 0.011007760651409626, "learning_rate": 1.251201923076923e-07, "loss": -0.0156, "num_tokens": 145466778.0, "reward": 0.8203740417957306, "reward_std": 0.188389353454113, "rewards/reward_fn/mean": 0.8203740417957306, "rewards/reward_fn/std": 0.18838933110237122, "sampling/importance_sampling_ratio/max": 1.989652693271637, "sampling/importance_sampling_ratio/mean": 0.4626861959695816, "sampling/importance_sampling_ratio/min": 3.5880277664546156e-06, "sampling/sampling_logp_difference/max": 2.429154336452484, "sampling/sampling_logp_difference/mean": 0.006373028038069606, "step": 7280, "step_time": 6.74479318484664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2559.3333333333335, "completions/max_terminated_length": 1980.3333333333333, "completions/mean_length": 906.1875, "completions/mean_terminated_length": 534.1590576171875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.022719074785709382, "epoch": 0.8762019230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.018247712403535843, "learning_rate": 1.2391826923076924e-07, "loss": 0.0034, "num_tokens": 145648972.0, "reward": 0.7381961345672607, "reward_std": 0.23285519083340964, "rewards/reward_fn/mean": 0.7381961345672607, "rewards/reward_fn/std": 0.2328551784157753, "sampling/importance_sampling_ratio/max": 1.966196616490682, "sampling/importance_sampling_ratio/mean": 0.44618910551071167, "sampling/importance_sampling_ratio/min": 0.0011634085282518452, "sampling/sampling_logp_difference/max": 2.070026954015096, "sampling/sampling_logp_difference/mean": 0.00560174851367871, "step": 7290, "step_time": 8.786015594191849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1849.5, "completions/mean_length": 1242.125, "completions/mean_terminated_length": 436.3416748046875, "completions/min_length": 135.5, "completions/min_terminated_length": 135.5, "entropy": 0.02529150154441595, "epoch": 0.8774038461538461, "frac_reward_zero_std": 0.0, "grad_norm": 0.034544117748737335, "learning_rate": 1.2271634615384615e-07, "loss": 0.0048, "num_tokens": 145801148.0, "reward": 0.6682203114032745, "reward_std": 0.3017227202653885, "rewards/reward_fn/mean": 0.6682203114032745, "rewards/reward_fn/std": 0.3017226904630661, "sampling/importance_sampling_ratio/max": 1.5302539467811584, "sampling/importance_sampling_ratio/mean": 0.3936443477869034, "sampling/importance_sampling_ratio/min": 1.4466681477642851e-05, "sampling/sampling_logp_difference/max": 1.9679373502731323, "sampling/sampling_logp_difference/mean": 0.005189862800762057, "step": 7300, "step_time": 7.034900940768421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.14583333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2218.0, "completions/mean_length": 916.9166666666666, "completions/mean_terminated_length": 560.8864339192709, "completions/min_length": 107.66666666666667, "completions/min_terminated_length": 107.66666666666667, "entropy": 0.021069722715765238, "epoch": 0.8786057692307693, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.007357918191701174, "learning_rate": 1.2151442307692307e-07, "loss": -0.0008, "num_tokens": 145992780.0, "reward": 0.7220484217007955, "reward_std": 0.2795896331469218, "rewards/reward_fn/mean": 0.7220484217007955, "rewards/reward_fn/std": 0.2795896381139755, "sampling/importance_sampling_ratio/max": 1.7592331171035767, "sampling/importance_sampling_ratio/mean": 0.5018013517061869, "sampling/importance_sampling_ratio/min": 8.196660443597163e-05, "sampling/sampling_logp_difference/max": 1.618666172027588, "sampling/sampling_logp_difference/mean": 0.004536938077459733, "step": 7310, "step_time": 10.069900226220488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1694.5, "completions/mean_length": 1192.578125, "completions/mean_terminated_length": 542.5333557128906, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "entropy": 0.024089214764535426, "epoch": 0.8798076923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.0017158495029434562, "learning_rate": 1.2031249999999999e-07, "loss": -0.0023, "num_tokens": 146144913.0, "reward": 0.7730247974395752, "reward_std": 0.21741057932376862, "rewards/reward_fn/mean": 0.7730247974395752, "rewards/reward_fn/std": 0.21741056442260742, "sampling/importance_sampling_ratio/max": 1.6539283394813538, "sampling/importance_sampling_ratio/mean": 0.3174494802951813, "sampling/importance_sampling_ratio/min": 2.7638634492177516e-05, "sampling/sampling_logp_difference/max": 4.700946807861328, "sampling/sampling_logp_difference/mean": 0.005403982475399971, "step": 7320, "step_time": 7.079006991535425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 2653.6666666666665, "completions/max_terminated_length": 1866.0, "completions/mean_length": 1225.4166666666667, "completions/mean_terminated_length": 539.4196065266927, "completions/min_length": 121.33333333333333, "completions/min_terminated_length": 121.33333333333333, "entropy": 0.024227752164006234, "epoch": 0.8810096153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.016327090561389923, "learning_rate": 1.1911057692307692e-07, "loss": -0.0013, "num_tokens": 146356089.0, "reward": 0.7057096163431803, "reward_std": 0.27423179149627686, "rewards/reward_fn/mean": 0.7057096163431803, "rewards/reward_fn/std": 0.27423180143038434, "sampling/importance_sampling_ratio/max": 1.7999670505523682, "sampling/importance_sampling_ratio/mean": 0.368155096968015, "sampling/importance_sampling_ratio/min": 0.004226351017374934, "sampling/sampling_logp_difference/max": 2.3677918910980225, "sampling/sampling_logp_difference/mean": 0.00565147368858258, "step": 7330, "step_time": 8.979186133760958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2381.5, "completions/mean_length": 1628.71875, "completions/mean_terminated_length": 690.4736633300781, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.022008861228823662, "epoch": 0.8822115384615384, "frac_reward_zero_std": 0.0, "grad_norm": 0.022257348522543907, "learning_rate": 1.1790865384615383e-07, "loss": 0.0048, "num_tokens": 146548495.0, "reward": 0.7025917768478394, "reward_std": 0.2628357335925102, "rewards/reward_fn/mean": 0.7025917768478394, "rewards/reward_fn/std": 0.2628357335925102, "sampling/importance_sampling_ratio/max": 1.1364530324935913, "sampling/importance_sampling_ratio/mean": 0.22308771312236786, "sampling/importance_sampling_ratio/min": 6.990054407651769e-05, "sampling/sampling_logp_difference/max": 2.2162042260169983, "sampling/sampling_logp_difference/mean": 0.004741972545161843, "step": 7340, "step_time": 7.128409451246261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.17708333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2035.6666666666667, "completions/mean_length": 936.6458333333334, "completions/mean_terminated_length": 491.7971903483073, "completions/min_length": 126.66666666666667, "completions/min_terminated_length": 126.66666666666667, "entropy": 0.025309059582650662, "epoch": 0.8834134615384616, "frac_reward_zero_std": 0.0, "grad_norm": 0.008844354189932346, "learning_rate": 1.1670673076923076e-07, "loss": 0.0008, "num_tokens": 146790877.0, "reward": 0.7890325784683228, "reward_std": 0.21570106347401938, "rewards/reward_fn/mean": 0.7890325784683228, "rewards/reward_fn/std": 0.21570107837518057, "sampling/importance_sampling_ratio/max": 1.408567190170288, "sampling/importance_sampling_ratio/mean": 0.3838229974110921, "sampling/importance_sampling_ratio/min": 4.353364553821848e-05, "sampling/sampling_logp_difference/max": 3.713716705640157, "sampling/sampling_logp_difference/mean": 0.005471164205422004, "step": 7350, "step_time": 10.353495687432588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2166.5, "completions/mean_length": 957.125, "completions/mean_terminated_length": 485.69232177734375, "completions/min_length": 141.5, "completions/min_terminated_length": 141.5, "entropy": 0.0220493471249938, "epoch": 0.8846153846153846, "frac_reward_zero_std": 0.125, "grad_norm": 0.006346854846924543, "learning_rate": 1.1550480769230769e-07, "loss": -0.0035, "num_tokens": 146922061.0, "reward": 0.6704209744930267, "reward_std": 0.2974829226732254, "rewards/reward_fn/mean": 0.6704209744930267, "rewards/reward_fn/std": 0.2974829226732254, "sampling/importance_sampling_ratio/max": 1.4836397171020508, "sampling/importance_sampling_ratio/mean": 0.3989153653383255, "sampling/importance_sampling_ratio/min": 0.00011773333676501352, "sampling/sampling_logp_difference/max": 3.383870005607605, "sampling/sampling_logp_difference/mean": 0.004874128382652998, "step": 7360, "step_time": 7.184011770412326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19791666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 1836.0, "completions/mean_length": 956.4270833333334, "completions/mean_terminated_length": 453.0601298014323, "completions/min_length": 110.33333333333333, "completions/min_terminated_length": 110.33333333333333, "entropy": 0.02488778382539749, "epoch": 0.8858173076923077, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.010740367695689201, "learning_rate": 1.1430288461538461e-07, "loss": 0.0057, "num_tokens": 147127926.0, "reward": 0.7105303605397543, "reward_std": 0.28731875618298847, "rewards/reward_fn/mean": 0.7105303605397543, "rewards/reward_fn/std": 0.28731875121593475, "sampling/importance_sampling_ratio/max": 1.7253244717915852, "sampling/importance_sampling_ratio/mean": 0.47907036542892456, "sampling/importance_sampling_ratio/min": 0.0002220119466376976, "sampling/sampling_logp_difference/max": 3.7054726282755532, "sampling/sampling_logp_difference/mean": 0.004770749559005101, "step": 7370, "step_time": 10.323745242133736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1600.5, "completions/mean_length": 1131.421875, "completions/mean_terminated_length": 399.9288635253906, "completions/min_length": 125.5, "completions/min_terminated_length": 125.5, "entropy": 0.0209888381883502, "epoch": 0.8870192307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.011542744934558868, "learning_rate": 1.1310096153846154e-07, "loss": -0.0005, "num_tokens": 147340193.0, "reward": 0.7511697709560394, "reward_std": 0.2294655293226242, "rewards/reward_fn/mean": 0.7511697709560394, "rewards/reward_fn/std": 0.2294655218720436, "sampling/importance_sampling_ratio/max": 1.7069790959358215, "sampling/importance_sampling_ratio/mean": 0.4306347668170929, "sampling/importance_sampling_ratio/min": 5.036226230004104e-05, "sampling/sampling_logp_difference/max": 2.3424586057662964, "sampling/sampling_logp_difference/mean": 0.005019746720790863, "step": 7380, "step_time": 7.8275963364169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 2588.3333333333335, "completions/mean_length": 1333.4375, "completions/mean_terminated_length": 777.7333170572916, "completions/min_length": 157.66666666666666, "completions/min_terminated_length": 157.66666666666666, "entropy": 0.024680630676448344, "epoch": 0.8882211538461539, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.013972589746117592, "learning_rate": 1.1189903846153847e-07, "loss": 0.0032, "num_tokens": 147580355.0, "reward": 0.7427690227826437, "reward_std": 0.24756039182345072, "rewards/reward_fn/mean": 0.7427690227826437, "rewards/reward_fn/std": 0.24756039679050446, "sampling/importance_sampling_ratio/max": 1.5592605272928874, "sampling/importance_sampling_ratio/mean": 0.33925533294677734, "sampling/importance_sampling_ratio/min": 2.3075019195554585e-05, "sampling/sampling_logp_difference/max": 2.4806787570317588, "sampling/sampling_logp_difference/mean": 0.005234446221341689, "step": 7390, "step_time": 10.289765996672212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2240.5, "completions/mean_length": 1770.34375, "completions/mean_terminated_length": 612.86279296875, "completions/min_length": 168.5, "completions/min_terminated_length": 168.5, "entropy": 0.023799567949026823, "epoch": 0.8894230769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.0028603808023035526, "learning_rate": 1.1069711538461537e-07, "loss": -0.0002, "num_tokens": 147788193.0, "reward": 0.6595389544963837, "reward_std": 0.27222467958927155, "rewards/reward_fn/mean": 0.6595389544963837, "rewards/reward_fn/std": 0.27222466468811035, "sampling/importance_sampling_ratio/max": 0.8181717097759247, "sampling/importance_sampling_ratio/mean": 0.1606041006743908, "sampling/importance_sampling_ratio/min": 2.8385743917169748e-05, "sampling/sampling_logp_difference/max": 3.377263844013214, "sampling/sampling_logp_difference/mean": 0.004585251212120056, "step": 7400, "step_time": 7.317410637810826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.11458333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2380.0, "completions/mean_length": 751.0208333333334, "completions/mean_terminated_length": 467.2629089355469, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.02387074865400791, "epoch": 0.890625, "frac_reward_zero_std": 0.0, "grad_norm": 0.02783520147204399, "learning_rate": 1.094951923076923e-07, "loss": 0.0061, "num_tokens": 147960683.0, "reward": 0.8358741601308187, "reward_std": 0.2014412780602773, "rewards/reward_fn/mean": 0.8358741601308187, "rewards/reward_fn/std": 0.20144127309322357, "sampling/importance_sampling_ratio/max": 1.2576530774434407, "sampling/importance_sampling_ratio/mean": 0.4226325253645579, "sampling/importance_sampling_ratio/min": 0.0002117262677832817, "sampling/sampling_logp_difference/max": 2.2849344412485757, "sampling/sampling_logp_difference/mean": 0.0057190884836018085, "step": 7410, "step_time": 10.014462502952664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1249.5, "completions/mean_length": 1222.203125, "completions/mean_terminated_length": 347.3399200439453, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.023530623130500316, "epoch": 0.8918269230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.012046712450683117, "learning_rate": 1.0829326923076923e-07, "loss": -0.0025, "num_tokens": 148147272.0, "reward": 0.7275427877902985, "reward_std": 0.24097993224859238, "rewards/reward_fn/mean": 0.7275427877902985, "rewards/reward_fn/std": 0.24097993224859238, "sampling/importance_sampling_ratio/max": 1.746551752090454, "sampling/importance_sampling_ratio/mean": 0.45554348826408386, "sampling/importance_sampling_ratio/min": 0.00011154783351230435, "sampling/sampling_logp_difference/max": 5.030673265457153, "sampling/sampling_logp_difference/mean": 0.00488872011192143, "step": 7420, "step_time": 7.30663424981758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2566.6666666666665, "completions/max_terminated_length": 1668.0, "completions/mean_length": 945.1770833333334, "completions/mean_terminated_length": 491.1376953125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.026017476804554462, "epoch": 0.8930288461538461, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.03666513413190842, "learning_rate": 1.0709134615384614e-07, "loss": 0.01, "num_tokens": 148339681.0, "reward": 0.6814051071802775, "reward_std": 0.3073769211769104, "rewards/reward_fn/mean": 0.6814051071802775, "rewards/reward_fn/std": 0.3073769013086955, "sampling/importance_sampling_ratio/max": 1.5055168867111206, "sampling/importance_sampling_ratio/mean": 0.38186747829119366, "sampling/importance_sampling_ratio/min": 0.0006316590970527614, "sampling/sampling_logp_difference/max": 3.791733225186666, "sampling/sampling_logp_difference/mean": 0.00619232002645731, "step": 7430, "step_time": 8.682967233099044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 991.765625, "completions/mean_terminated_length": 491.50941467285156, "completions/min_length": 176.5, "completions/min_terminated_length": 176.5, "entropy": 0.02671054992824793, "epoch": 0.8942307692307693, "frac_reward_zero_std": 0.0, "grad_norm": 0.0011601871810853481, "learning_rate": 1.0588942307692307e-07, "loss": 0.0057, "num_tokens": 148483338.0, "reward": 0.7612771391868591, "reward_std": 0.25066807866096497, "rewards/reward_fn/mean": 0.7612771391868591, "rewards/reward_fn/std": 0.25066809356212616, "sampling/importance_sampling_ratio/max": 1.2765768766403198, "sampling/importance_sampling_ratio/mean": 0.32692594081163406, "sampling/importance_sampling_ratio/min": 4.1991121406681486e-05, "sampling/sampling_logp_difference/max": 2.0133216381073, "sampling/sampling_logp_difference/mean": 0.00603694561868906, "step": 7440, "step_time": 6.936040192190558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 1808.3333333333333, "completions/mean_length": 892.1875, "completions/mean_terminated_length": 467.33315022786456, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.02422291338443756, "epoch": 0.8954326923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.0471547432243824, "learning_rate": 1.046875e-07, "loss": -0.0049, "num_tokens": 148675564.0, "reward": 0.8037102023760477, "reward_std": 0.23786724110444388, "rewards/reward_fn/mean": 0.8037102023760477, "rewards/reward_fn/std": 0.2378672460714976, "sampling/importance_sampling_ratio/max": 1.2910267909367878, "sampling/importance_sampling_ratio/mean": 0.40833840767542523, "sampling/importance_sampling_ratio/min": 0.00016996245055148998, "sampling/sampling_logp_difference/max": 1.885620911916097, "sampling/sampling_logp_difference/mean": 0.005605286918580532, "step": 7450, "step_time": 10.019524520635605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1462.0, "completions/mean_length": 852.375, "completions/mean_terminated_length": 411.36000061035156, "completions/min_length": 115.5, "completions/min_terminated_length": 115.5, "entropy": 0.02328806035220623, "epoch": 0.8966346153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.002944865031167865, "learning_rate": 1.034855769230769e-07, "loss": 0.0014, "num_tokens": 148805020.0, "reward": 0.757118284702301, "reward_std": 0.24948205053806305, "rewards/reward_fn/mean": 0.757118284702301, "rewards/reward_fn/std": 0.24948207288980484, "sampling/importance_sampling_ratio/max": 1.1794378459453583, "sampling/importance_sampling_ratio/mean": 0.3731890916824341, "sampling/importance_sampling_ratio/min": 0.0002419702723273076, "sampling/sampling_logp_difference/max": 2.1411141753196716, "sampling/sampling_logp_difference/mean": 0.005378400441259146, "step": 7460, "step_time": 6.845286812074482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2916666666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2056.6666666666665, "completions/mean_length": 1282.6458333333333, "completions/mean_terminated_length": 557.19482421875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.02401028908789158, "epoch": 0.8978365384615384, "frac_reward_zero_std": 0.0, "grad_norm": 0.03606105595827103, "learning_rate": 1.0228365384615384e-07, "loss": 0.0006, "num_tokens": 149033706.0, "reward": 0.7212531566619873, "reward_std": 0.25010235607624054, "rewards/reward_fn/mean": 0.7212531566619873, "rewards/reward_fn/std": 0.25010234614213306, "sampling/importance_sampling_ratio/max": 1.4628093441327412, "sampling/importance_sampling_ratio/mean": 0.3260755290587743, "sampling/importance_sampling_ratio/min": 5.662754462794813e-05, "sampling/sampling_logp_difference/max": 3.025109847386678, "sampling/sampling_logp_difference/mean": 0.005879023422797521, "step": 7470, "step_time": 10.39213576382026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1152.0, "completions/mean_length": 734.046875, "completions/mean_terminated_length": 410.3393096923828, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.025515443086624144, "epoch": 0.8990384615384616, "frac_reward_zero_std": 0.0, "grad_norm": 0.006124628707766533, "learning_rate": 1.0108173076923077e-07, "loss": -0.0035, "num_tokens": 149154013.0, "reward": 0.8125390112400055, "reward_std": 0.21390778571367264, "rewards/reward_fn/mean": 0.8125390112400055, "rewards/reward_fn/std": 0.21390777081251144, "sampling/importance_sampling_ratio/max": 1.2324548959732056, "sampling/importance_sampling_ratio/mean": 0.3480503708124161, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.8977662324905396, "sampling/sampling_logp_difference/mean": 0.00577492406591773, "step": 7480, "step_time": 6.846916056424379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3020833333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2375.0, "completions/mean_length": 1240.5208333333333, "completions/mean_terminated_length": 481.2536214192708, "completions/min_length": 115.33333333333333, "completions/min_terminated_length": 115.33333333333333, "entropy": 0.023895590007305144, "epoch": 0.9002403846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.01197132933884859, "learning_rate": 9.98798076923077e-08, "loss": 0.0002, "num_tokens": 149388591.0, "reward": 0.7081373532613119, "reward_std": 0.25501670440038043, "rewards/reward_fn/mean": 0.7081373532613119, "rewards/reward_fn/std": 0.255016694466273, "sampling/importance_sampling_ratio/max": 1.494978706041972, "sampling/importance_sampling_ratio/mean": 0.39429381489753723, "sampling/importance_sampling_ratio/min": 0.00039985198774653935, "sampling/sampling_logp_difference/max": 2.141720175743103, "sampling/sampling_logp_difference/mean": 0.005018363706767559, "step": 7490, "step_time": 10.10414308262989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 1176.5, "completions/mean_length": 987.78125, "completions/mean_terminated_length": 305.9214401245117, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.021950899809598922, "epoch": 0.9014423076923077, "frac_reward_zero_std": 0.125, "grad_norm": 0.005100979004055262, "learning_rate": 9.867788461538461e-08, "loss": -0.0104, "num_tokens": 149532305.0, "reward": 0.7646174132823944, "reward_std": 0.21969882398843765, "rewards/reward_fn/mean": 0.7646174132823944, "rewards/reward_fn/std": 0.21969883143901825, "sampling/importance_sampling_ratio/max": 1.828898549079895, "sampling/importance_sampling_ratio/mean": 0.44360075891017914, "sampling/importance_sampling_ratio/min": 0.0005129986857355107, "sampling/sampling_logp_difference/max": 1.5096490979194641, "sampling/sampling_logp_difference/mean": 0.0054928939789533615, "step": 7500, "step_time": 6.865829965937882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1640.0, "completions/mean_length": 1686.8229166666667, "completions/mean_terminated_length": 592.1142171223959, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "entropy": 0.0219386282376945, "epoch": 0.9026442307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.011590865440666676, "learning_rate": 9.747596153846154e-08, "loss": -0.0078, "num_tokens": 149801824.0, "reward": 0.6429856618245443, "reward_std": 0.25830261905988056, "rewards/reward_fn/mean": 0.6429856618245443, "rewards/reward_fn/std": 0.25830261905988056, "sampling/importance_sampling_ratio/max": 2.0451844533284507, "sampling/importance_sampling_ratio/mean": 0.29715976615746814, "sampling/importance_sampling_ratio/min": 9.843799868273587e-06, "sampling/sampling_logp_difference/max": 2.773058374722799, "sampling/sampling_logp_difference/mean": 0.005258500886460145, "step": 7510, "step_time": 10.37885792274028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1825.0, "completions/mean_length": 898.640625, "completions/mean_terminated_length": 427.1835632324219, "completions/min_length": 122.5, "completions/min_terminated_length": 122.5, "entropy": 0.026795326545834542, "epoch": 0.9038461538461539, "frac_reward_zero_std": 0.0, "grad_norm": 0.012723995372653008, "learning_rate": 9.627403846153846e-08, "loss": 0.0027, "num_tokens": 149932217.0, "reward": 0.7107701599597931, "reward_std": 0.29066669940948486, "rewards/reward_fn/mean": 0.7107701599597931, "rewards/reward_fn/std": 0.29066669940948486, "sampling/importance_sampling_ratio/max": 1.8569095730781555, "sampling/importance_sampling_ratio/mean": 0.49911993741989136, "sampling/importance_sampling_ratio/min": 0.0023256261526967137, "sampling/sampling_logp_difference/max": 2.6930602192878723, "sampling/sampling_logp_difference/mean": 0.005793992429971695, "step": 7520, "step_time": 6.5861680536530915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.22916666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 2397.3333333333335, "completions/mean_length": 1093.2916666666667, "completions/mean_terminated_length": 537.8819986979166, "completions/min_length": 117.33333333333333, "completions/min_terminated_length": 117.33333333333333, "entropy": 0.02016293928027153, "epoch": 0.9050480769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.004981832578778267, "learning_rate": 9.507211538461537e-08, "loss": -0.0008, "num_tokens": 150143061.0, "reward": 0.678200344244639, "reward_std": 0.29073819021383923, "rewards/reward_fn/mean": 0.678200344244639, "rewards/reward_fn/std": 0.29073819518089294, "sampling/importance_sampling_ratio/max": 1.320523460706075, "sampling/importance_sampling_ratio/mean": 0.44131384293238324, "sampling/importance_sampling_ratio/min": 2.7598631239319122e-05, "sampling/sampling_logp_difference/max": 2.9974742333094277, "sampling/sampling_logp_difference/mean": 0.004499164720376332, "step": 7530, "step_time": 10.373406297620386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2234.5, "completions/mean_length": 948.4375, "completions/mean_terminated_length": 480.21728515625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.022612263541668655, "epoch": 0.90625, "frac_reward_zero_std": 0.125, "grad_norm": 0.009247737936675549, "learning_rate": 9.38701923076923e-08, "loss": -0.0077, "num_tokens": 150423193.0, "reward": 0.6076427102088928, "reward_std": 0.3866080045700073, "rewards/reward_fn/mean": 0.6076427102088928, "rewards/reward_fn/std": 0.3866080045700073, "sampling/importance_sampling_ratio/max": 1.656259536743164, "sampling/importance_sampling_ratio/mean": 0.4597812592983246, "sampling/importance_sampling_ratio/min": 3.6871025258733425e-05, "sampling/sampling_logp_difference/max": 2.7015069723129272, "sampling/sampling_logp_difference/mean": 0.005178103689104319, "step": 7540, "step_time": 8.756587490439415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 2013.6666666666667, "completions/mean_length": 1097.6354166666667, "completions/mean_terminated_length": 456.06394449869794, "completions/min_length": 120.33333333333333, "completions/min_terminated_length": 120.33333333333333, "entropy": 0.020615937374532222, "epoch": 0.9074519230769231, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.0040368372574448586, "learning_rate": 9.266826923076923e-08, "loss": -0.002, "num_tokens": 150628982.0, "reward": 0.6439030567804972, "reward_std": 0.33256151775519055, "rewards/reward_fn/mean": 0.6439030567804972, "rewards/reward_fn/std": 0.33256151775519055, "sampling/importance_sampling_ratio/max": 1.2269320090611775, "sampling/importance_sampling_ratio/mean": 0.38686784108479816, "sampling/importance_sampling_ratio/min": 6.534597863113352e-05, "sampling/sampling_logp_difference/max": 2.7871155738830566, "sampling/sampling_logp_difference/mean": 0.00469689816236496, "step": 7550, "step_time": 10.397084212582559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1425.0, "completions/mean_length": 1391.234375, "completions/mean_terminated_length": 617.4344482421875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.023448551632463933, "epoch": 0.9086538461538461, "frac_reward_zero_std": 0.0, "grad_norm": 0.004472734406590462, "learning_rate": 9.146634615384615e-08, "loss": -0.0132, "num_tokens": 150791285.0, "reward": 0.7416820526123047, "reward_std": 0.22331593930721283, "rewards/reward_fn/mean": 0.7416820526123047, "rewards/reward_fn/std": 0.22331595420837402, "sampling/importance_sampling_ratio/max": 2.131582498550415, "sampling/importance_sampling_ratio/mean": 0.34319640696048737, "sampling/importance_sampling_ratio/min": 0.00013242067598184804, "sampling/sampling_logp_difference/max": 4.752918720245361, "sampling/sampling_logp_difference/mean": 0.005056134657934308, "step": 7560, "step_time": 7.01577072525397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2679.6666666666665, "completions/mean_length": 1416.5520833333333, "completions/mean_terminated_length": 579.7553304036459, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.02595179509371519, "epoch": 0.9098557692307693, "frac_reward_zero_std": 0.0, "grad_norm": 0.030144017189741135, "learning_rate": 9.026442307692306e-08, "loss": -0.0053, "num_tokens": 151042634.0, "reward": 0.6921668450037638, "reward_std": 0.2805541356404622, "rewards/reward_fn/mean": 0.6921668450037638, "rewards/reward_fn/std": 0.2805541306734085, "sampling/importance_sampling_ratio/max": 1.7143583297729492, "sampling/importance_sampling_ratio/mean": 0.32729512453079224, "sampling/importance_sampling_ratio/min": 7.613641052254631e-05, "sampling/sampling_logp_difference/max": 3.246266523996989, "sampling/sampling_logp_difference/mean": 0.005541305989027023, "step": 7570, "step_time": 10.246717272605746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2080.5, "completions/mean_length": 1486.96875, "completions/mean_terminated_length": 694.4285736083984, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.02067892849445343, "epoch": 0.9110576923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.04441765323281288, "learning_rate": 8.90625e-08, "loss": 0.0079, "num_tokens": 151220104.0, "reward": 0.7260687053203583, "reward_std": 0.2548936605453491, "rewards/reward_fn/mean": 0.7260687053203583, "rewards/reward_fn/std": 0.2548936679959297, "sampling/importance_sampling_ratio/max": 1.391110360622406, "sampling/importance_sampling_ratio/mean": 0.2914687246084213, "sampling/importance_sampling_ratio/min": 9.630230852053501e-05, "sampling/sampling_logp_difference/max": 1.9019020199775696, "sampling/sampling_logp_difference/mean": 0.0046846086625009775, "step": 7580, "step_time": 7.075415592640638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.11458333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1681.3333333333333, "completions/mean_length": 694.8229166666666, "completions/mean_terminated_length": 402.7154846191406, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.023402905277907848, "epoch": 0.9122596153846154, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.005320425145328045, "learning_rate": 8.786057692307691e-08, "loss": -0.0051, "num_tokens": 151392447.0, "reward": 0.7676119605700175, "reward_std": 0.24405433734258017, "rewards/reward_fn/mean": 0.7676119605700175, "rewards/reward_fn/std": 0.24405433734258017, "sampling/importance_sampling_ratio/max": 1.4980304837226868, "sampling/importance_sampling_ratio/mean": 0.5019521911938986, "sampling/importance_sampling_ratio/min": 5.511035593978401e-05, "sampling/sampling_logp_difference/max": 2.8983359336853027, "sampling/sampling_logp_difference/mean": 0.004923022662599881, "step": 7590, "step_time": 10.026918361335992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1815.0, "completions/max_terminated_length": 645.5, "completions/mean_length": 503.875, "completions/mean_terminated_length": 201.22999572753906, "completions/min_length": 99.5, "completions/min_terminated_length": 99.5, "entropy": 0.016542264819145204, "epoch": 0.9134615384615384, "frac_reward_zero_std": 0.375, "grad_norm": 0.006225775461643934, "learning_rate": 8.665865384615384e-08, "loss": 0.0024, "num_tokens": 151496903.0, "reward": 0.6262556314468384, "reward_std": 0.39344529807567596, "rewards/reward_fn/mean": 0.6262556314468384, "rewards/reward_fn/std": 0.39344528317451477, "sampling/importance_sampling_ratio/max": 1.6495481729507446, "sampling/importance_sampling_ratio/mean": 0.6917306482791901, "sampling/importance_sampling_ratio/min": 9.123951895162463e-05, "sampling/sampling_logp_difference/max": 1.1023616790771484, "sampling/sampling_logp_difference/mean": 0.0035217597614973783, "step": 7600, "step_time": 4.394445089809596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2916666666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2580.3333333333335, "completions/mean_length": 1366.8020833333333, "completions/mean_terminated_length": 704.5973510742188, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.022377376072108747, "epoch": 0.9146634615384616, "frac_reward_zero_std": 0.0, "grad_norm": 0.01006261259317398, "learning_rate": 8.545673076923077e-08, "loss": -0.0021, "num_tokens": 151732620.0, "reward": 0.7322112520535787, "reward_std": 0.2632448822259903, "rewards/reward_fn/mean": 0.7322112520535787, "rewards/reward_fn/std": 0.2632448524236679, "sampling/importance_sampling_ratio/max": 1.818622310956319, "sampling/importance_sampling_ratio/mean": 0.35362791021664935, "sampling/importance_sampling_ratio/min": 5.387636580659697e-05, "sampling/sampling_logp_difference/max": 2.2979928255081177, "sampling/sampling_logp_difference/mean": 0.005208233837038279, "step": 7610, "step_time": 10.199832140654326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2217.5, "completions/mean_length": 1029.3125, "completions/mean_terminated_length": 585.2529907226562, "completions/min_length": 131.5, "completions/min_terminated_length": 131.5, "entropy": 0.02464922275394201, "epoch": 0.9158653846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.008909350261092186, "learning_rate": 8.42548076923077e-08, "loss": -0.0065, "num_tokens": 151860232.0, "reward": 0.7856850326061249, "reward_std": 0.22814174741506577, "rewards/reward_fn/mean": 0.7856850326061249, "rewards/reward_fn/std": 0.22814173251390457, "sampling/importance_sampling_ratio/max": 1.230878233909607, "sampling/importance_sampling_ratio/mean": 0.3505781441926956, "sampling/importance_sampling_ratio/min": 0.00040071838157018647, "sampling/sampling_logp_difference/max": 1.2834688425064087, "sampling/sampling_logp_difference/mean": 0.00539809837937355, "step": 7620, "step_time": 6.982736258767545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 1405.0, "completions/mean_length": 863.46875, "completions/mean_terminated_length": 436.60215250651044, "completions/min_length": 113.33333333333333, "completions/min_terminated_length": 113.33333333333333, "entropy": 0.023394824657589196, "epoch": 0.9170673076923077, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.007861290127038956, "learning_rate": 8.30528846153846e-08, "loss": 0.0026, "num_tokens": 152047901.0, "reward": 0.6908499002456665, "reward_std": 0.33244749903678894, "rewards/reward_fn/mean": 0.6908499002456665, "rewards/reward_fn/std": 0.33244748910268146, "sampling/importance_sampling_ratio/max": 1.9508239030838013, "sampling/importance_sampling_ratio/mean": 0.4961552619934082, "sampling/importance_sampling_ratio/min": 0.00014175191366424164, "sampling/sampling_logp_difference/max": 1.693197210629781, "sampling/sampling_logp_difference/mean": 0.00511311087757349, "step": 7630, "step_time": 10.200914450269192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1201.5, "completions/mean_length": 951.59375, "completions/mean_terminated_length": 385.1352844238281, "completions/min_length": 127.5, "completions/min_terminated_length": 127.5, "entropy": 0.021911515947431327, "epoch": 0.9182692307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.009145568124949932, "learning_rate": 8.185096153846153e-08, "loss": 0.0113, "num_tokens": 152177259.0, "reward": 0.7883379459381104, "reward_std": 0.23313291370868683, "rewards/reward_fn/mean": 0.7883379459381104, "rewards/reward_fn/std": 0.23313292115926743, "sampling/importance_sampling_ratio/max": 1.7197271585464478, "sampling/importance_sampling_ratio/mean": 0.47118204832077026, "sampling/importance_sampling_ratio/min": 2.4766318347246852e-05, "sampling/sampling_logp_difference/max": 2.2342554330825806, "sampling/sampling_logp_difference/mean": 0.005080868722870946, "step": 7640, "step_time": 6.96397555610165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10416666666666667, "completions/max_length": 2634.3333333333335, "completions/max_terminated_length": 1982.0, "completions/mean_length": 697.0208333333334, "completions/mean_terminated_length": 432.6496988932292, "completions/min_length": 138.66666666666666, "completions/min_terminated_length": 138.66666666666666, "entropy": 0.020408103801310064, "epoch": 0.9194711538461539, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.010375679470598698, "learning_rate": 8.064903846153846e-08, "loss": 0.0107, "num_tokens": 152336701.0, "reward": 0.8371870517730713, "reward_std": 0.151638379941384, "rewards/reward_fn/mean": 0.8371870517730713, "rewards/reward_fn/std": 0.15163838180402914, "sampling/importance_sampling_ratio/max": 1.433179259300232, "sampling/importance_sampling_ratio/mean": 0.5209020773569742, "sampling/importance_sampling_ratio/min": 8.329253839889361e-05, "sampling/sampling_logp_difference/max": 2.530484676361084, "sampling/sampling_logp_difference/mean": 0.00432466894077758, "step": 7650, "step_time": 8.671835376601667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1823.0, "completions/mean_length": 700.484375, "completions/mean_terminated_length": 455.1893005371094, "completions/min_length": 97.5, "completions/min_terminated_length": 97.5, "entropy": 0.020837727189064025, "epoch": 0.9206730769230769, "frac_reward_zero_std": 0.125, "grad_norm": 0.008813225664198399, "learning_rate": 7.944711538461538e-08, "loss": -0.0075, "num_tokens": 152458924.0, "reward": 0.849463015794754, "reward_std": 0.16238375008106232, "rewards/reward_fn/mean": 0.849463015794754, "rewards/reward_fn/std": 0.16238374263048172, "sampling/importance_sampling_ratio/max": 2.2283538579940796, "sampling/importance_sampling_ratio/mean": 0.5750009715557098, "sampling/importance_sampling_ratio/min": 0.00020095201034564525, "sampling/sampling_logp_difference/max": 2.8448755741119385, "sampling/sampling_logp_difference/mean": 0.004800786962732673, "step": 7660, "step_time": 6.6396821267902855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.11458333333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2536.3333333333335, "completions/mean_length": 955.5520833333334, "completions/mean_terminated_length": 701.2977905273438, "completions/min_length": 206.66666666666666, "completions/min_terminated_length": 206.66666666666666, "entropy": 0.024851907044649124, "epoch": 0.921875, "frac_reward_zero_std": 0.0, "grad_norm": 0.0153171606361866, "learning_rate": 7.824519230769231e-08, "loss": -0.0112, "num_tokens": 152658441.0, "reward": 0.8257758418718973, "reward_std": 0.18666058282057443, "rewards/reward_fn/mean": 0.8257758418718973, "rewards/reward_fn/std": 0.18666057785352072, "sampling/importance_sampling_ratio/max": 1.716183026631673, "sampling/importance_sampling_ratio/mean": 0.3454395532608032, "sampling/importance_sampling_ratio/min": 6.825587964461495e-05, "sampling/sampling_logp_difference/max": 2.137394746144613, "sampling/sampling_logp_difference/mean": 0.005416997087498506, "step": 7670, "step_time": 10.087784818559886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1284.96875, "completions/mean_terminated_length": 676.0146789550781, "completions/min_length": 174.5, "completions/min_terminated_length": 174.5, "entropy": 0.02472005654126406, "epoch": 0.9230769230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.002996362280100584, "learning_rate": 7.704326923076924e-08, "loss": 0.0247, "num_tokens": 152829063.0, "reward": 0.762664407491684, "reward_std": 0.25374244153499603, "rewards/reward_fn/mean": 0.762664407491684, "rewards/reward_fn/std": 0.25374243408441544, "sampling/importance_sampling_ratio/max": 1.9668455123901367, "sampling/importance_sampling_ratio/mean": 0.2881523668766022, "sampling/importance_sampling_ratio/min": 0.00010913185946037629, "sampling/sampling_logp_difference/max": 3.3555731177330017, "sampling/sampling_logp_difference/mean": 0.005627075210213661, "step": 7680, "step_time": 7.221519209444523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.14583333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1645.3333333333333, "completions/mean_length": 814.90625, "completions/mean_terminated_length": 434.49424235026044, "completions/min_length": 122.66666666666667, "completions/min_terminated_length": 122.66666666666667, "entropy": 0.021214120741933584, "epoch": 0.9242788461538461, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.010708819143474102, "learning_rate": 7.584134615384614e-08, "loss": -0.0101, "num_tokens": 153003222.0, "reward": 0.7168106436729431, "reward_std": 0.2917083998521169, "rewards/reward_fn/mean": 0.7168106436729431, "rewards/reward_fn/std": 0.2917083849509557, "sampling/importance_sampling_ratio/max": 1.980438192685445, "sampling/importance_sampling_ratio/mean": 0.5092085202534994, "sampling/importance_sampling_ratio/min": 7.475557158424333e-05, "sampling/sampling_logp_difference/max": 2.387693246205648, "sampling/sampling_logp_difference/mean": 0.005185612166921298, "step": 7690, "step_time": 9.939658845588564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2336.5, "completions/mean_length": 1508.328125, "completions/mean_terminated_length": 822.7882080078125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.022035762667655945, "epoch": 0.9254807692307693, "frac_reward_zero_std": 0.0, "grad_norm": 0.0046994928270578384, "learning_rate": 7.463942307692307e-08, "loss": 0.0038, "num_tokens": 153174875.0, "reward": 0.5750629305839539, "reward_std": 0.31673048436641693, "rewards/reward_fn/mean": 0.5750629305839539, "rewards/reward_fn/std": 0.31673048436641693, "sampling/importance_sampling_ratio/max": 1.1052957773208618, "sampling/importance_sampling_ratio/mean": 0.3010253421962261, "sampling/importance_sampling_ratio/min": 0.0002605913349924549, "sampling/sampling_logp_difference/max": 2.981741428375244, "sampling/sampling_logp_difference/mean": 0.004808577708899975, "step": 7700, "step_time": 6.998927946854383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2787.3333333333335, "completions/max_terminated_length": 2484.6666666666665, "completions/mean_length": 993.3333333333334, "completions/mean_terminated_length": 570.0724283854166, "completions/min_length": 115.33333333333333, "completions/min_terminated_length": 115.33333333333333, "entropy": 0.025550145469605923, "epoch": 0.9266826923076923, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.040202222764492035, "learning_rate": 7.34375e-08, "loss": -0.0111, "num_tokens": 153379475.0, "reward": 0.7801185846328735, "reward_std": 0.21051724255084991, "rewards/reward_fn/mean": 0.7801185846328735, "rewards/reward_fn/std": 0.21051723261674246, "sampling/importance_sampling_ratio/max": 1.5145415465037029, "sampling/importance_sampling_ratio/mean": 0.40265915791193646, "sampling/importance_sampling_ratio/min": 0.0036968793371367306, "sampling/sampling_logp_difference/max": 1.4806833267211914, "sampling/sampling_logp_difference/mean": 0.005806875880807638, "step": 7710, "step_time": 9.437943210452795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2295.5, "completions/mean_length": 1100.9375, "completions/mean_terminated_length": 579.1368865966797, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.023869127221405505, "epoch": 0.9278846153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.0036510087084025145, "learning_rate": 7.223557692307691e-08, "loss": -0.0004, "num_tokens": 153529519.0, "reward": 0.7403983175754547, "reward_std": 0.27938616275787354, "rewards/reward_fn/mean": 0.7403983175754547, "rewards/reward_fn/std": 0.27938615530729294, "sampling/importance_sampling_ratio/max": 0.9005981683731079, "sampling/importance_sampling_ratio/mean": 0.3164163827896118, "sampling/importance_sampling_ratio/min": 1.555213100346009e-05, "sampling/sampling_logp_difference/max": 3.272798478603363, "sampling/sampling_logp_difference/mean": 0.005728658055886626, "step": 7720, "step_time": 7.259303708467632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2314.0, "completions/mean_length": 1048.34375, "completions/mean_terminated_length": 504.4028625488281, "completions/min_length": 123.66666666666667, "completions/min_terminated_length": 123.66666666666667, "entropy": 0.02548861736431718, "epoch": 0.9290865384615384, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.007188319694250822, "learning_rate": 7.103365384615384e-08, "loss": -0.0023, "num_tokens": 153715952.0, "reward": 0.6576840480168661, "reward_std": 0.3152591735124588, "rewards/reward_fn/mean": 0.6576840480168661, "rewards/reward_fn/std": 0.3152591635783513, "sampling/importance_sampling_ratio/max": 1.3224559624989827, "sampling/importance_sampling_ratio/mean": 0.380905419588089, "sampling/importance_sampling_ratio/min": 0.0007364111349185501, "sampling/sampling_logp_difference/max": 2.955739180246989, "sampling/sampling_logp_difference/mean": 0.005024171589563291, "step": 7730, "step_time": 10.107157300319523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1379.5, "completions/mean_length": 1513.265625, "completions/mean_terminated_length": 356.91666412353516, "completions/min_length": 94.5, "completions/min_terminated_length": 94.5, "entropy": 0.02098573911935091, "epoch": 0.9302884615384616, "frac_reward_zero_std": 0.0, "grad_norm": 0.0060958461835980415, "learning_rate": 6.983173076923077e-08, "loss": 0.0079, "num_tokens": 153881153.0, "reward": 0.5985312759876251, "reward_std": 0.29948437213897705, "rewards/reward_fn/mean": 0.5985312759876251, "rewards/reward_fn/std": 0.29948437213897705, "sampling/importance_sampling_ratio/max": 1.8541702032089233, "sampling/importance_sampling_ratio/mean": 0.4576105326414108, "sampling/importance_sampling_ratio/min": 1.4104207195941854e-05, "sampling/sampling_logp_difference/max": 4.798595428466797, "sampling/sampling_logp_difference/mean": 0.005132420687004924, "step": 7740, "step_time": 7.172427290212363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.17708333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1736.0, "completions/mean_length": 904.0520833333334, "completions/mean_terminated_length": 455.21844482421875, "completions/min_length": 104.66666666666667, "completions/min_terminated_length": 104.66666666666667, "entropy": 0.026654193922877312, "epoch": 0.9314903846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.008381958119571209, "learning_rate": 6.862980769230769e-08, "loss": 0.0032, "num_tokens": 154070110.0, "reward": 0.7649306456247965, "reward_std": 0.2439268430074056, "rewards/reward_fn/mean": 0.7649306456247965, "rewards/reward_fn/std": 0.24392683307329813, "sampling/importance_sampling_ratio/max": 1.4296890099843342, "sampling/importance_sampling_ratio/mean": 0.4255981345971425, "sampling/importance_sampling_ratio/min": 0.00023554392828373238, "sampling/sampling_logp_difference/max": 2.0596953630447388, "sampling/sampling_logp_difference/mean": 0.006077323574572802, "step": 7750, "step_time": 10.120254872553051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1734.5, "completions/mean_length": 854.765625, "completions/mean_terminated_length": 409.24073791503906, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.02452252171933651, "epoch": 0.9326923076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.015755223110318184, "learning_rate": 6.74278846153846e-08, "loss": -0.0031, "num_tokens": 154195847.0, "reward": 0.7941393554210663, "reward_std": 0.20356485247612, "rewards/reward_fn/mean": 0.7941393554210663, "rewards/reward_fn/std": 0.20356485247612, "sampling/importance_sampling_ratio/max": 1.5049638152122498, "sampling/importance_sampling_ratio/mean": 0.545969158411026, "sampling/importance_sampling_ratio/min": 2.6498322085899417e-05, "sampling/sampling_logp_difference/max": 2.8964074850082397, "sampling/sampling_logp_difference/mean": 0.005221407627686858, "step": 7760, "step_time": 6.864735059160739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2604166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2065.0, "completions/mean_length": 1163.6145833333333, "completions/mean_terminated_length": 506.135009765625, "completions/min_length": 123.66666666666667, "completions/min_terminated_length": 123.66666666666667, "entropy": 0.023835274018347265, "epoch": 0.9338942307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.005578158888965845, "learning_rate": 6.622596153846154e-08, "loss": -0.0008, "num_tokens": 154422186.0, "reward": 0.7353414297103882, "reward_std": 0.23783555130163828, "rewards/reward_fn/mean": 0.7353414297103882, "rewards/reward_fn/std": 0.23783554136753082, "sampling/importance_sampling_ratio/max": 1.1121115883191426, "sampling/importance_sampling_ratio/mean": 0.33996587495009106, "sampling/importance_sampling_ratio/min": 1.1697819597126605e-05, "sampling/sampling_logp_difference/max": 1.4900113344192505, "sampling/sampling_logp_difference/mean": 0.005415990793456634, "step": 7770, "step_time": 10.264192867279053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2137.5, "completions/max_terminated_length": 2111.5, "completions/mean_length": 695.90625, "completions/mean_terminated_length": 476.76202392578125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.028625539131462575, "epoch": 0.9350961538461539, "frac_reward_zero_std": 0.0, "grad_norm": 0.014659915119409561, "learning_rate": 6.502403846153847e-08, "loss": -0.0014, "num_tokens": 154527508.0, "reward": 0.817951112985611, "reward_std": 0.20466292649507523, "rewards/reward_fn/mean": 0.817951112985611, "rewards/reward_fn/std": 0.20466292649507523, "sampling/importance_sampling_ratio/max": 1.1813028454780579, "sampling/importance_sampling_ratio/mean": 0.3492683917284012, "sampling/importance_sampling_ratio/min": 5.094287189422175e-06, "sampling/sampling_logp_difference/max": 1.7829495668411255, "sampling/sampling_logp_difference/mean": 0.006715858355164528, "step": 7780, "step_time": 5.086402426101268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3229166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1840.0, "completions/mean_length": 1348.15625, "completions/mean_terminated_length": 576.036631266276, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.025116137973964215, "epoch": 0.9362980769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.008458830416202545, "learning_rate": 6.382211538461538e-08, "loss": -0.0071, "num_tokens": 154767427.0, "reward": 0.7128275235493978, "reward_std": 0.23879291117191315, "rewards/reward_fn/mean": 0.7128275235493978, "rewards/reward_fn/std": 0.23879289627075195, "sampling/importance_sampling_ratio/max": 1.3480412165323894, "sampling/importance_sampling_ratio/mean": 0.3045271883408229, "sampling/importance_sampling_ratio/min": 0.0001356091401021331, "sampling/sampling_logp_difference/max": 3.470893065134684, "sampling/sampling_logp_difference/mean": 0.005127652548253536, "step": 7790, "step_time": 10.344250881578773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 3000.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1533.328125, "completions/mean_terminated_length": 631.8871459960938, "completions/min_length": 202.5, "completions/min_terminated_length": 202.5, "entropy": 0.024354116804897786, "epoch": 0.9375, "frac_reward_zero_std": 0.0, "grad_norm": 0.018389659002423286, "learning_rate": 6.262019230769231e-08, "loss": -0.0035, "num_tokens": 154943712.0, "reward": 0.6638675630092621, "reward_std": 0.26674996316432953, "rewards/reward_fn/mean": 0.6638675630092621, "rewards/reward_fn/std": 0.26674995571374893, "sampling/importance_sampling_ratio/max": 1.3489108085632324, "sampling/importance_sampling_ratio/mean": 0.25099221616983414, "sampling/importance_sampling_ratio/min": 9.006145273815491e-05, "sampling/sampling_logp_difference/max": 2.6189520359039307, "sampling/sampling_logp_difference/mean": 0.004996292060241103, "step": 7800, "step_time": 7.371121418569237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2604166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1520.6666666666667, "completions/mean_length": 1133.9270833333333, "completions/mean_terminated_length": 483.4144592285156, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.02145428266376257, "epoch": 0.9387019230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.019023526459932327, "learning_rate": 6.141826923076923e-08, "loss": 0.012, "num_tokens": 155134353.0, "reward": 0.6962936917940775, "reward_std": 0.3077341616153717, "rewards/reward_fn/mean": 0.6962936917940775, "rewards/reward_fn/std": 0.3077341665824254, "sampling/importance_sampling_ratio/max": 1.7698726654052734, "sampling/importance_sampling_ratio/mean": 0.42244545618693036, "sampling/importance_sampling_ratio/min": 6.728350369182105e-05, "sampling/sampling_logp_difference/max": 2.300467610359192, "sampling/sampling_logp_difference/mean": 0.004589365795254707, "step": 7810, "step_time": 10.058049626927822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 2791.0, "completions/mean_length": 1134.828125, "completions/mean_terminated_length": 516.8391265869141, "completions/min_length": 128.5, "completions/min_terminated_length": 128.5, "entropy": 0.025716573931276798, "epoch": 0.9399038461538461, "frac_reward_zero_std": 0.0, "grad_norm": 0.030497346073389053, "learning_rate": 6.021634615384616e-08, "loss": -0.0129, "num_tokens": 155289398.0, "reward": 0.7881846725940704, "reward_std": 0.23084315657615662, "rewards/reward_fn/mean": 0.7881846725940704, "rewards/reward_fn/std": 0.23084315657615662, "sampling/importance_sampling_ratio/max": 1.5511526465415955, "sampling/importance_sampling_ratio/mean": 0.35320964455604553, "sampling/importance_sampling_ratio/min": 1.2022107512166258e-05, "sampling/sampling_logp_difference/max": 1.9366034865379333, "sampling/sampling_logp_difference/mean": 0.005735594313591719, "step": 7820, "step_time": 6.979181376099587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2174.6666666666665, "completions/mean_length": 1304.9270833333333, "completions/mean_terminated_length": 690.4574788411459, "completions/min_length": 164.66666666666666, "completions/min_terminated_length": 164.66666666666666, "entropy": 0.022495807521045207, "epoch": 0.9411057692307693, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.005322665441781282, "learning_rate": 5.901442307692307e-08, "loss": 0.0101, "num_tokens": 155544199.0, "reward": 0.6616517305374146, "reward_std": 0.31458422541618347, "rewards/reward_fn/mean": 0.6616517305374146, "rewards/reward_fn/std": 0.3145842452843984, "sampling/importance_sampling_ratio/max": 1.1975582043329875, "sampling/importance_sampling_ratio/mean": 0.3669981559117635, "sampling/importance_sampling_ratio/min": 0.00010592152523258846, "sampling/sampling_logp_difference/max": 3.4124794801076255, "sampling/sampling_logp_difference/mean": 0.004582775291055441, "step": 7830, "step_time": 10.207055037096143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2315.0, "completions/mean_length": 1336.84375, "completions/mean_terminated_length": 635.4802551269531, "completions/min_length": 121.5, "completions/min_terminated_length": 121.5, "entropy": 0.02448528874665499, "epoch": 0.9423076923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.009450988844037056, "learning_rate": 5.78125e-08, "loss": -0.0085, "num_tokens": 155702829.0, "reward": 0.6913867592811584, "reward_std": 0.2767977863550186, "rewards/reward_fn/mean": 0.6913867592811584, "rewards/reward_fn/std": 0.2767977863550186, "sampling/importance_sampling_ratio/max": 1.2094191312789917, "sampling/importance_sampling_ratio/mean": 0.3376618027687073, "sampling/importance_sampling_ratio/min": 1.3160872299522453e-05, "sampling/sampling_logp_difference/max": 2.3363629579544067, "sampling/sampling_logp_difference/mean": 0.005602208431810141, "step": 7840, "step_time": 7.108602107595653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1971.3333333333333, "completions/mean_length": 901.3541666666666, "completions/mean_terminated_length": 525.4920756022135, "completions/min_length": 103.66666666666667, "completions/min_terminated_length": 103.66666666666667, "entropy": 0.02306621242314577, "epoch": 0.9435096153846154, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.008368675597012043, "learning_rate": 5.661057692307692e-08, "loss": -0.0016, "num_tokens": 155900143.0, "reward": 0.73486328125, "reward_std": 0.29423798620700836, "rewards/reward_fn/mean": 0.73486328125, "rewards/reward_fn/std": 0.29423798123995465, "sampling/importance_sampling_ratio/max": 1.6061615149180095, "sampling/importance_sampling_ratio/mean": 0.48205456137657166, "sampling/importance_sampling_ratio/min": 0.0026715144679959244, "sampling/sampling_logp_difference/max": 2.2442133029301963, "sampling/sampling_logp_difference/mean": 0.005524315405637026, "step": 7850, "step_time": 9.88020561169833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2645.5, "completions/mean_length": 1721.96875, "completions/mean_terminated_length": 788.244140625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.023935039155185224, "epoch": 0.9447115384615384, "frac_reward_zero_std": 0.0, "grad_norm": 0.0022274982184171677, "learning_rate": 5.540865384615384e-08, "loss": -0.0069, "num_tokens": 156098605.0, "reward": 0.6733593344688416, "reward_std": 0.2549806535243988, "rewards/reward_fn/mean": 0.6733593344688416, "rewards/reward_fn/std": 0.2549806460738182, "sampling/importance_sampling_ratio/max": 0.9382368922233582, "sampling/importance_sampling_ratio/mean": 0.20726759731769562, "sampling/importance_sampling_ratio/min": 1.9568833522498608e-06, "sampling/sampling_logp_difference/max": 1.791448950767517, "sampling/sampling_logp_difference/mean": 0.005302433855831623, "step": 7860, "step_time": 7.287622083537281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1607.6666666666667, "completions/mean_length": 955.6041666666666, "completions/mean_terminated_length": 373.5620422363281, "completions/min_length": 124.33333333333333, "completions/min_terminated_length": 124.33333333333333, "entropy": 0.02214056523516774, "epoch": 0.9459134615384616, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.030773982405662537, "learning_rate": 5.420673076923077e-08, "loss": -0.0041, "num_tokens": 156290031.0, "reward": 0.6437143286069235, "reward_std": 0.34059170881907147, "rewards/reward_fn/mean": 0.6437143286069235, "rewards/reward_fn/std": 0.3405916889508565, "sampling/importance_sampling_ratio/max": 1.3529030084609985, "sampling/importance_sampling_ratio/mean": 0.4774252275625865, "sampling/importance_sampling_ratio/min": 7.831348009782839e-05, "sampling/sampling_logp_difference/max": 1.9482771555582683, "sampling/sampling_logp_difference/mean": 0.004672549820194642, "step": 7870, "step_time": 10.279620694555343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1354.3125, "completions/mean_terminated_length": 658.8214416503906, "completions/min_length": 195.5, "completions/min_terminated_length": 195.5, "entropy": 0.024266308173537254, "epoch": 0.9471153846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.0037685236893594265, "learning_rate": 5.300480769230769e-08, "loss": 0.0031, "num_tokens": 156441451.0, "reward": 0.7485523521900177, "reward_std": 0.22641711682081223, "rewards/reward_fn/mean": 0.7485523521900177, "rewards/reward_fn/std": 0.22641710937023163, "sampling/importance_sampling_ratio/max": 1.7405387163162231, "sampling/importance_sampling_ratio/mean": 0.3161941468715668, "sampling/importance_sampling_ratio/min": 4.92480285174679e-05, "sampling/sampling_logp_difference/max": 2.541625738143921, "sampling/sampling_logp_difference/mean": 0.005368562648072839, "step": 7880, "step_time": 7.097150971554219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1279.6666666666667, "completions/mean_length": 805.0208333333334, "completions/mean_terminated_length": 398.52695719401044, "completions/min_length": 128.33333333333334, "completions/min_terminated_length": 128.33333333333334, "entropy": 0.026725776866078375, "epoch": 0.9483173076923077, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.01116994395852089, "learning_rate": 5.180288461538462e-08, "loss": 0.0123, "num_tokens": 156632301.0, "reward": 0.795872708161672, "reward_std": 0.19238253931204477, "rewards/reward_fn/mean": 0.795872708161672, "rewards/reward_fn/std": 0.19238253186146417, "sampling/importance_sampling_ratio/max": 1.7176645199457805, "sampling/importance_sampling_ratio/mean": 0.40801603595415753, "sampling/importance_sampling_ratio/min": 0.0023215979202480717, "sampling/sampling_logp_difference/max": 2.8683915535608926, "sampling/sampling_logp_difference/mean": 0.00617445632815361, "step": 7890, "step_time": 9.966826215945185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1762.0, "completions/mean_length": 973.796875, "completions/mean_terminated_length": 410.4198913574219, "completions/min_length": 83.5, "completions/min_terminated_length": 83.5, "entropy": 0.02330870069563389, "epoch": 0.9495192307692307, "frac_reward_zero_std": 0.125, "grad_norm": 0.011481431312859058, "learning_rate": 5.060096153846154e-08, "loss": -0.0026, "num_tokens": 156763488.0, "reward": 0.7532200813293457, "reward_std": 0.25072702020406723, "rewards/reward_fn/mean": 0.7532200813293457, "rewards/reward_fn/std": 0.25072702020406723, "sampling/importance_sampling_ratio/max": 1.0914337635040283, "sampling/importance_sampling_ratio/mean": 0.4028462767601013, "sampling/importance_sampling_ratio/min": 8.708483392183552e-06, "sampling/sampling_logp_difference/max": 3.1737940311431885, "sampling/sampling_logp_difference/mean": 0.00539248320274055, "step": 7900, "step_time": 7.270251852832734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20833333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1765.0, "completions/mean_length": 997.4479166666666, "completions/mean_terminated_length": 465.515380859375, "completions/min_length": 126.66666666666667, "completions/min_terminated_length": 126.66666666666667, "entropy": 0.02303301766514778, "epoch": 0.9507211538461539, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.004974342416971922, "learning_rate": 4.9399038461538456e-08, "loss": -0.0022, "num_tokens": 156970635.0, "reward": 0.631437083085378, "reward_std": 0.3337977280219396, "rewards/reward_fn/mean": 0.631437083085378, "rewards/reward_fn/std": 0.3337977131207784, "sampling/importance_sampling_ratio/max": 1.6331817309061687, "sampling/importance_sampling_ratio/mean": 0.4746861159801483, "sampling/importance_sampling_ratio/min": 4.2473776071953275e-05, "sampling/sampling_logp_difference/max": 1.7043744723002117, "sampling/sampling_logp_difference/mean": 0.004780279317249854, "step": 7910, "step_time": 10.346644201502205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1781.0, "completions/mean_length": 1328.578125, "completions/mean_terminated_length": 390.19020080566406, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.022177705727517604, "epoch": 0.9519230769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.006984512321650982, "learning_rate": 4.8197115384615385e-08, "loss": 0.0028, "num_tokens": 157142680.0, "reward": 0.7361558377742767, "reward_std": 0.2367601990699768, "rewards/reward_fn/mean": 0.7361558377742767, "rewards/reward_fn/std": 0.2367601990699768, "sampling/importance_sampling_ratio/max": 1.2347660660743713, "sampling/importance_sampling_ratio/mean": 0.3357328027486801, "sampling/importance_sampling_ratio/min": 2.597196271381108e-06, "sampling/sampling_logp_difference/max": 2.084959864616394, "sampling/sampling_logp_difference/mean": 0.005165978102013469, "step": 7920, "step_time": 7.023464213125408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23958333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 1704.0, "completions/mean_length": 1009.6770833333334, "completions/mean_terminated_length": 430.8280029296875, "completions/min_length": 131.33333333333334, "completions/min_terminated_length": 131.33333333333334, "entropy": 0.021978449448943137, "epoch": 0.953125, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.019762400537729263, "learning_rate": 4.699519230769231e-08, "loss": -0.006, "num_tokens": 157353737.0, "reward": 0.6982929706573486, "reward_std": 0.2651403546333313, "rewards/reward_fn/mean": 0.6982929706573486, "rewards/reward_fn/std": 0.2651403546333313, "sampling/importance_sampling_ratio/max": 1.3562353650728862, "sampling/importance_sampling_ratio/mean": 0.49174176653226215, "sampling/importance_sampling_ratio/min": 0.00016110553457338028, "sampling/sampling_logp_difference/max": 2.1603910525639853, "sampling/sampling_logp_difference/mean": 0.004839559396107991, "step": 7930, "step_time": 9.92867163894698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2703.0, "completions/mean_length": 1180.65625, "completions/mean_terminated_length": 671.239990234375, "completions/min_length": 155.5, "completions/min_terminated_length": 155.5, "entropy": 0.024597014114260674, "epoch": 0.9543269230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.016359562054276466, "learning_rate": 4.5793269230769224e-08, "loss": -0.014, "num_tokens": 157517491.0, "reward": 0.7128284275531769, "reward_std": 0.2812885046005249, "rewards/reward_fn/mean": 0.7128284275531769, "rewards/reward_fn/std": 0.2812885046005249, "sampling/importance_sampling_ratio/max": 1.1543431282043457, "sampling/importance_sampling_ratio/mean": 0.2942010164260864, "sampling/importance_sampling_ratio/min": 2.801186155920732e-06, "sampling/sampling_logp_difference/max": 3.143549680709839, "sampling/sampling_logp_difference/mean": 0.005800557788461447, "step": 7940, "step_time": 7.315268752723933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2408.6666666666665, "completions/mean_length": 1323.2604166666667, "completions/mean_terminated_length": 669.765625, "completions/min_length": 125.33333333333333, "completions/min_terminated_length": 125.33333333333333, "entropy": 0.026315651834011078, "epoch": 0.9555288461538461, "frac_reward_zero_std": 0.0, "grad_norm": 0.009845824912190437, "learning_rate": 4.4591346153846154e-08, "loss": -0.006, "num_tokens": 157758228.0, "reward": 0.7068402965863546, "reward_std": 0.261968657374382, "rewards/reward_fn/mean": 0.7068402965863546, "rewards/reward_fn/std": 0.2619686673084895, "sampling/importance_sampling_ratio/max": 1.2599780758221943, "sampling/importance_sampling_ratio/mean": 0.28351401289304096, "sampling/importance_sampling_ratio/min": 1.6079509805422276e-05, "sampling/sampling_logp_difference/max": 2.6696592966715493, "sampling/sampling_logp_difference/mean": 0.00536598963662982, "step": 7950, "step_time": 10.458813178539277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1751.5, "completions/mean_length": 895.671875, "completions/mean_terminated_length": 457.83570861816406, "completions/min_length": 110.5, "completions/min_terminated_length": 110.5, "entropy": 0.02570444270968437, "epoch": 0.9567307692307693, "frac_reward_zero_std": 0.0, "grad_norm": 0.017755696550011635, "learning_rate": 4.338942307692307e-08, "loss": 0.008, "num_tokens": 157877407.0, "reward": 0.7796018123626709, "reward_std": 0.251366451382637, "rewards/reward_fn/mean": 0.7796018123626709, "rewards/reward_fn/std": 0.2513664439320564, "sampling/importance_sampling_ratio/max": 1.6693899035453796, "sampling/importance_sampling_ratio/mean": 0.4380089193582535, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.6506847739219666, "sampling/sampling_logp_difference/mean": 0.006448759464547038, "step": 7960, "step_time": 6.832136713992805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10416666666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2423.6666666666665, "completions/mean_length": 867.9791666666666, "completions/mean_terminated_length": 618.5428771972656, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.029731352627277375, "epoch": 0.9579326923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.012003275565803051, "learning_rate": 4.21875e-08, "loss": 0.012, "num_tokens": 158079925.0, "reward": 0.8479775190353394, "reward_std": 0.16917998840411505, "rewards/reward_fn/mean": 0.8479775190353394, "rewards/reward_fn/std": 0.1691799908876419, "sampling/importance_sampling_ratio/max": 1.3570435444513957, "sampling/importance_sampling_ratio/mean": 0.35735302170117694, "sampling/importance_sampling_ratio/min": 1.0230491625407012e-05, "sampling/sampling_logp_difference/max": 1.5452421108881633, "sampling/sampling_logp_difference/mean": 0.006709124427288771, "step": 7970, "step_time": 9.95943385027349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2398.0, "completions/mean_length": 1166.234375, "completions/mean_terminated_length": 614.8152313232422, "completions/min_length": 160.5, "completions/min_terminated_length": 160.5, "entropy": 0.02583100814372301, "epoch": 0.9591346153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.010062092915177345, "learning_rate": 4.098557692307692e-08, "loss": -0.0065, "num_tokens": 158232652.0, "reward": 0.7360019087791443, "reward_std": 0.24507630616426468, "rewards/reward_fn/mean": 0.7360019087791443, "rewards/reward_fn/std": 0.24507630616426468, "sampling/importance_sampling_ratio/max": 1.2497612833976746, "sampling/importance_sampling_ratio/mean": 0.3129016160964966, "sampling/importance_sampling_ratio/min": 4.7237135277100606e-05, "sampling/sampling_logp_difference/max": 2.472673177719116, "sampling/sampling_logp_difference/mean": 0.005903882905840874, "step": 7980, "step_time": 7.119847309403122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1911.6666666666667, "completions/mean_length": 1117.2604166666667, "completions/mean_terminated_length": 407.8651835123698, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.023843961395323278, "epoch": 0.9603365384615384, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.016372613608837128, "learning_rate": 3.978365384615384e-08, "loss": -0.0009, "num_tokens": 158452029.0, "reward": 0.6533395648002625, "reward_std": 0.2892807126045227, "rewards/reward_fn/mean": 0.6533395648002625, "rewards/reward_fn/std": 0.289280707637469, "sampling/importance_sampling_ratio/max": 1.2954025665918987, "sampling/importance_sampling_ratio/mean": 0.41602983077367145, "sampling/importance_sampling_ratio/min": 6.693314209845387e-06, "sampling/sampling_logp_difference/max": 2.3263783852259317, "sampling/sampling_logp_difference/mean": 0.005348416821410258, "step": 7990, "step_time": 10.141532607469708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1917.0, "completions/mean_length": 1013.796875, "completions/mean_terminated_length": 458.95835876464844, "completions/min_length": 120.5, "completions/min_terminated_length": 120.5, "entropy": 0.026402413472533227, "epoch": 0.9615384615384616, "frac_reward_zero_std": 0.0, "grad_norm": 0.008746017701923847, "learning_rate": 3.858173076923077e-08, "loss": -0.0008, "num_tokens": 158588248.0, "reward": 0.7671359181404114, "reward_std": 0.2562745586037636, "rewards/reward_fn/mean": 0.7671359181404114, "rewards/reward_fn/std": 0.2562745735049248, "sampling/importance_sampling_ratio/max": 1.516546905040741, "sampling/importance_sampling_ratio/mean": 0.34584271907806396, "sampling/importance_sampling_ratio/min": 0.00014566548634320498, "sampling/sampling_logp_difference/max": 3.6701226234436035, "sampling/sampling_logp_difference/mean": 0.005926941055804491, "step": 8000, "step_time": 6.827609028853476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3645833333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 1949.0, "completions/mean_length": 1426.28125, "completions/mean_terminated_length": 526.5271809895834, "completions/min_length": 150.33333333333334, "completions/min_terminated_length": 150.33333333333334, "entropy": 0.020076114498078824, "epoch": 0.9627403846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.00550507428124547, "learning_rate": 3.737980769230769e-08, "loss": 0.0073, "num_tokens": 158833427.0, "reward": 0.6483802596728007, "reward_std": 0.3145770877599716, "rewards/reward_fn/mean": 0.6483802596728007, "rewards/reward_fn/std": 0.3145770827929179, "sampling/importance_sampling_ratio/max": 1.4142675399780273, "sampling/importance_sampling_ratio/mean": 0.3869028488794963, "sampling/importance_sampling_ratio/min": 8.942030581238214e-05, "sampling/sampling_logp_difference/max": 2.9211107889811196, "sampling/sampling_logp_difference/mean": 0.004811267135664821, "step": 8010, "step_time": 10.26985913356766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2199.5, "completions/mean_length": 1153.328125, "completions/mean_terminated_length": 660.2175750732422, "completions/min_length": 142.5, "completions/min_terminated_length": 142.5, "entropy": 0.024467202462255956, "epoch": 0.9639423076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.008723726496100426, "learning_rate": 3.6177884615384614e-08, "loss": -0.0047, "num_tokens": 158994216.0, "reward": 0.7615698277950287, "reward_std": 0.25390321016311646, "rewards/reward_fn/mean": 0.7615698277950287, "rewards/reward_fn/std": 0.25390321016311646, "sampling/importance_sampling_ratio/max": 2.0289029479026794, "sampling/importance_sampling_ratio/mean": 0.35813675075769424, "sampling/importance_sampling_ratio/min": 1.9632835801530746e-06, "sampling/sampling_logp_difference/max": 3.0456278324127197, "sampling/sampling_logp_difference/mean": 0.006146574625745416, "step": 8020, "step_time": 7.104568145889789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3020833333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 2497.0, "completions/mean_length": 1241.7395833333333, "completions/mean_terminated_length": 492.2425944010417, "completions/min_length": 126.33333333333333, "completions/min_terminated_length": 126.33333333333333, "entropy": 0.025916178710758686, "epoch": 0.9651442307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.005508045665919781, "learning_rate": 3.497596153846154e-08, "loss": -0.0048, "num_tokens": 159242975.0, "reward": 0.7301279902458191, "reward_std": 0.2316205451885859, "rewards/reward_fn/mean": 0.7301279902458191, "rewards/reward_fn/std": 0.23162053028742471, "sampling/importance_sampling_ratio/max": 1.4226386547088623, "sampling/importance_sampling_ratio/mean": 0.3272657146056493, "sampling/importance_sampling_ratio/min": 7.858285471229465e-06, "sampling/sampling_logp_difference/max": 3.816377321879069, "sampling/sampling_logp_difference/mean": 0.005431069216380517, "step": 8030, "step_time": 10.540303995274007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3000.0, "completions/max_terminated_length": 2076.0, "completions/mean_length": 1263.515625, "completions/mean_terminated_length": 684.6875, "completions/min_length": 104.5, "completions/min_terminated_length": 104.5, "entropy": 0.026316511444747448, "epoch": 0.9663461538461539, "frac_reward_zero_std": 0.0, "grad_norm": 0.015495823696255684, "learning_rate": 3.377403846153846e-08, "loss": -0.0101, "num_tokens": 159412776.0, "reward": 0.7359975874423981, "reward_std": 0.2684158533811569, "rewards/reward_fn/mean": 0.7359975874423981, "rewards/reward_fn/std": 0.2684158608317375, "sampling/importance_sampling_ratio/max": 1.4361955523490906, "sampling/importance_sampling_ratio/mean": 0.29910267144441605, "sampling/importance_sampling_ratio/min": 3.762532878681668e-05, "sampling/sampling_logp_difference/max": 1.6421390771865845, "sampling/sampling_logp_difference/mean": 0.005722710397094488, "step": 8040, "step_time": 7.198765543103218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3020833333333333, "completions/max_length": 3000.0, "completions/max_terminated_length": 936.3333333333334, "completions/mean_length": 1159.25, "completions/mean_terminated_length": 361.23463948567706, "completions/min_length": 117.33333333333333, "completions/min_terminated_length": 117.33333333333333, "entropy": 0.024431832320988178, "epoch": 0.9675480769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.003818425117060542, "learning_rate": 3.257211538461538e-08, "loss": -0.0077, "num_tokens": 159644160.0, "reward": 0.6729174455006918, "reward_std": 0.28954016665617627, "rewards/reward_fn/mean": 0.6729174455006918, "rewards/reward_fn/std": 0.28954016665617627, "sampling/importance_sampling_ratio/max": 1.5548736254374187, "sampling/importance_sampling_ratio/mean": 0.3477560877799988, "sampling/importance_sampling_ratio/min": 6.875976282572083e-05, "sampling/sampling_logp_difference/max": 3.51050595442454, "sampling/sampling_logp_difference/mean": 0.005632845529665549, "step": 8050, "step_time": 10.32230165451765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1960.5, "completions/mean_length": 1067.0, "completions/mean_terminated_length": 582.1770172119141, "completions/min_length": 181.5, "completions/min_terminated_length": 181.5, "entropy": 0.022261154279112814, "epoch": 0.96875, "frac_reward_zero_std": 0.125, "grad_norm": 0.007268994580954313, "learning_rate": 3.1370192307692306e-08, "loss": -0.0011, "num_tokens": 159800472.0, "reward": 0.6674293577671051, "reward_std": 0.3324936628341675, "rewards/reward_fn/mean": 0.6674293577671051, "rewards/reward_fn/std": 0.3324936777353287, "sampling/importance_sampling_ratio/max": 1.7337409853935242, "sampling/importance_sampling_ratio/mean": 0.3950204849243164, "sampling/importance_sampling_ratio/min": 0.0002918313257396221, "sampling/sampling_logp_difference/max": 2.0205233097076416, "sampling/sampling_logp_difference/mean": 0.00423056841827929, "step": 8060, "step_time": 7.257623486965895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.14583333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2162.6666666666665, "completions/mean_length": 905.6145833333334, "completions/mean_terminated_length": 550.3548075358073, "completions/min_length": 110.66666666666667, "completions/min_terminated_length": 110.66666666666667, "entropy": 0.027503027208149435, "epoch": 0.9699519230769231, "frac_reward_zero_std": 0.0, "grad_norm": 0.00784021895378828, "learning_rate": 3.016826923076923e-08, "loss": -0.0083, "num_tokens": 159998491.0, "reward": 0.7116446495056152, "reward_std": 0.2924923002719879, "rewards/reward_fn/mean": 0.7116446495056152, "rewards/reward_fn/std": 0.29249229033788043, "sampling/importance_sampling_ratio/max": 2.193974018096924, "sampling/importance_sampling_ratio/mean": 0.48140762249628705, "sampling/importance_sampling_ratio/min": 0.00010179366654483601, "sampling/sampling_logp_difference/max": 2.0109497706095376, "sampling/sampling_logp_difference/mean": 0.006411832136412461, "step": 8070, "step_time": 10.058602398261428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2337.5, "completions/mean_length": 956.6875, "completions/mean_terminated_length": 483.97776794433594, "completions/min_length": 125.5, "completions/min_terminated_length": 125.5, "entropy": 0.025247685424983502, "epoch": 0.9711538461538461, "frac_reward_zero_std": 0.0, "grad_norm": 0.003888274310156703, "learning_rate": 2.8966346153846155e-08, "loss": -0.012, "num_tokens": 160133623.0, "reward": 0.7973791062831879, "reward_std": 0.2056639865040779, "rewards/reward_fn/mean": 0.7973791062831879, "rewards/reward_fn/std": 0.20566397905349731, "sampling/importance_sampling_ratio/max": 2.124198853969574, "sampling/importance_sampling_ratio/mean": 0.49724212288856506, "sampling/importance_sampling_ratio/min": 7.664021723030601e-05, "sampling/sampling_logp_difference/max": 1.8554974794387817, "sampling/sampling_logp_difference/mean": 0.0053954217582941055, "step": 8080, "step_time": 6.966091001313179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19791666666666666, "completions/max_length": 3000.0, "completions/max_terminated_length": 1633.0, "completions/mean_length": 994.0520833333334, "completions/mean_terminated_length": 499.4621073404948, "completions/min_length": 111.33333333333333, "completions/min_terminated_length": 111.33333333333333, "entropy": 0.02639772407710552, "epoch": 0.9723557692307693, "frac_reward_zero_std": 0.0, "grad_norm": 0.013680015690624714, "learning_rate": 2.7764423076923074e-08, "loss": -0.0076, "num_tokens": 160360284.0, "reward": 0.7363166411717733, "reward_std": 0.2708418071269989, "rewards/reward_fn/mean": 0.7363166411717733, "rewards/reward_fn/std": 0.2708418071269989, "sampling/importance_sampling_ratio/max": 1.2218357920646667, "sampling/importance_sampling_ratio/mean": 0.312731812397639, "sampling/importance_sampling_ratio/min": 0.00045008173962438985, "sampling/sampling_logp_difference/max": 3.0039265950520835, "sampling/sampling_logp_difference/mean": 0.006421710674961408, "step": 8090, "step_time": 10.638145965337753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2132.0, "completions/mean_length": 1048.5, "completions/mean_terminated_length": 549.7430725097656, "completions/min_length": 134.5, "completions/min_terminated_length": 134.5, "entropy": 0.023573335073888303, "epoch": 0.9735576923076923, "frac_reward_zero_std": 0.0, "grad_norm": 0.008922174572944641, "learning_rate": 2.6562499999999997e-08, "loss": 0.0066, "num_tokens": 160499220.0, "reward": 0.7063708305358887, "reward_std": 0.3024941012263298, "rewards/reward_fn/mean": 0.7063708305358887, "rewards/reward_fn/std": 0.3024941012263298, "sampling/importance_sampling_ratio/max": 1.6134949922561646, "sampling/importance_sampling_ratio/mean": 0.4109546095132828, "sampling/importance_sampling_ratio/min": 0.0007944266544654965, "sampling/sampling_logp_difference/max": 1.959004282951355, "sampling/sampling_logp_difference/mean": 0.005250295624136925, "step": 8100, "step_time": 6.883121698629111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20833333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2287.0, "completions/mean_length": 1008.7083333333334, "completions/mean_terminated_length": 489.7022298177083, "completions/min_length": 123.66666666666667, "completions/min_terminated_length": 123.66666666666667, "entropy": 0.020769100915640593, "epoch": 0.9747596153846154, "frac_reward_zero_std": 0.16666666666666666, "grad_norm": 0.01902693696320057, "learning_rate": 2.5360576923076923e-08, "loss": -0.0018, "num_tokens": 160709928.0, "reward": 0.6479375958442688, "reward_std": 0.33822257320086163, "rewards/reward_fn/mean": 0.6479375958442688, "rewards/reward_fn/std": 0.33822256326675415, "sampling/importance_sampling_ratio/max": 1.5743529399236043, "sampling/importance_sampling_ratio/mean": 0.4629463056723277, "sampling/importance_sampling_ratio/min": 0.00031029227223674144, "sampling/sampling_logp_difference/max": 2.6445189317067466, "sampling/sampling_logp_difference/mean": 0.0044280846292773885, "step": 8110, "step_time": 10.070124500151724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 3000.0, "completions/max_terminated_length": 2392.0, "completions/mean_length": 1345.6875, "completions/mean_terminated_length": 643.6101379394531, "completions/min_length": 145.5, "completions/min_terminated_length": 145.5, "entropy": 0.025301958806812762, "epoch": 0.9759615384615384, "frac_reward_zero_std": 0.0, "grad_norm": 0.008140431717038155, "learning_rate": 2.4158653846153846e-08, "loss": -0.0033, "num_tokens": 160869684.0, "reward": 0.6855023503303528, "reward_std": 0.2697366178035736, "rewards/reward_fn/mean": 0.6855023503303528, "rewards/reward_fn/std": 0.2697366327047348, "sampling/importance_sampling_ratio/max": 1.113320231437683, "sampling/importance_sampling_ratio/mean": 0.30981938540935516, "sampling/importance_sampling_ratio/min": 5.290765921017737e-05, "sampling/sampling_logp_difference/max": 3.427857995033264, "sampling/sampling_logp_difference/mean": 0.005832986440509558, "step": 8120, "step_time": 7.137052455917001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2916666666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2220.0, "completions/mean_length": 1240.8854166666667, "completions/mean_terminated_length": 534.4685262044271, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.02387373298406601, "epoch": 0.9771634615384616, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.010871346108615398, "learning_rate": 2.295673076923077e-08, "loss": 0.0043, "num_tokens": 161099865.0, "reward": 0.716742753982544, "reward_std": 0.23941849172115326, "rewards/reward_fn/mean": 0.716742753982544, "rewards/reward_fn/std": 0.23941847681999207, "sampling/importance_sampling_ratio/max": 1.1614173452059429, "sampling/importance_sampling_ratio/mean": 0.32445557912190753, "sampling/importance_sampling_ratio/min": 1.715868611196214e-05, "sampling/sampling_logp_difference/max": 3.2644129594167075, "sampling/sampling_logp_difference/mean": 0.005125041740636031, "step": 8130, "step_time": 10.28223267411813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1686.0, "completions/mean_length": 870.625, "completions/mean_terminated_length": 625.8111267089844, "completions/min_length": 161.5, "completions/min_terminated_length": 161.5, "entropy": 0.025601825304329397, "epoch": 0.9783653846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.04213299602270126, "learning_rate": 2.1754807692307692e-08, "loss": 0.0022, "num_tokens": 161221065.0, "reward": 0.8015683591365814, "reward_std": 0.21379803121089935, "rewards/reward_fn/mean": 0.8015683591365814, "rewards/reward_fn/std": 0.21379801630973816, "sampling/importance_sampling_ratio/max": 1.2608416676521301, "sampling/importance_sampling_ratio/mean": 0.3908625394105911, "sampling/importance_sampling_ratio/min": 0.0017489528181613423, "sampling/sampling_logp_difference/max": 2.72549170255661, "sampling/sampling_logp_difference/mean": 0.00561427790671587, "step": 8140, "step_time": 6.735845191869885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1617.0, "completions/mean_length": 1434.8541666666667, "completions/mean_terminated_length": 522.7089131673177, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.023347006365656852, "epoch": 0.9795673076923077, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.003250946057960391, "learning_rate": 2.0552884615384615e-08, "loss": 0.0035, "num_tokens": 161484899.0, "reward": 0.6943393548329672, "reward_std": 0.2696772962808609, "rewards/reward_fn/mean": 0.6943393548329672, "rewards/reward_fn/std": 0.2696772913138072, "sampling/importance_sampling_ratio/max": 1.2238127787907918, "sampling/importance_sampling_ratio/mean": 0.31074892977873486, "sampling/importance_sampling_ratio/min": 1.274307487619808e-05, "sampling/sampling_logp_difference/max": 3.983906348546346, "sampling/sampling_logp_difference/mean": 0.004537191552420457, "step": 8150, "step_time": 10.466524195298552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3000.0, "completions/max_terminated_length": 1699.5, "completions/mean_length": 1284.28125, "completions/mean_terminated_length": 399.5411682128906, "completions/min_length": 119.5, "completions/min_terminated_length": 119.5, "entropy": 0.022263996303081512, "epoch": 0.9807692307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.010139893740415573, "learning_rate": 1.9350961538461538e-08, "loss": -0.0061, "num_tokens": 161650221.0, "reward": 0.6944399774074554, "reward_std": 0.28809164464473724, "rewards/reward_fn/mean": 0.6944399774074554, "rewards/reward_fn/std": 0.28809164464473724, "sampling/importance_sampling_ratio/max": 1.510222315788269, "sampling/importance_sampling_ratio/mean": 0.39948415756225586, "sampling/importance_sampling_ratio/min": 1.988138865272049e-05, "sampling/sampling_logp_difference/max": 4.14191746711731, "sampling/sampling_logp_difference/mean": 0.005016436101868749, "step": 8160, "step_time": 7.1678809902630745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2002.6666666666667, "completions/mean_length": 1421.1354166666667, "completions/mean_terminated_length": 595.588623046875, "completions/min_length": 165.33333333333334, "completions/min_terminated_length": 165.33333333333334, "entropy": 0.021346988063305615, "epoch": 0.9819711538461539, "frac_reward_zero_std": 0.0, "grad_norm": 0.021745797246694565, "learning_rate": 1.814903846153846e-08, "loss": 0.0017, "num_tokens": 161888586.0, "reward": 0.6708402633666992, "reward_std": 0.2885885685682297, "rewards/reward_fn/mean": 0.6708402633666992, "rewards/reward_fn/std": 0.2885885536670685, "sampling/importance_sampling_ratio/max": 1.5349178314208984, "sampling/importance_sampling_ratio/mean": 0.3394276400407155, "sampling/importance_sampling_ratio/min": 0.00020578883171159154, "sampling/sampling_logp_difference/max": 1.782108465830485, "sampling/sampling_logp_difference/mean": 0.004644213709980249, "step": 8170, "step_time": 10.31346586169675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1465.0, "completions/mean_length": 1247.765625, "completions/mean_terminated_length": 576.9647521972656, "completions/min_length": 144.5, "completions/min_terminated_length": 144.5, "entropy": 0.022513503767549993, "epoch": 0.9831730769230769, "frac_reward_zero_std": 0.0, "grad_norm": 0.005608225706964731, "learning_rate": 1.6947115384615383e-08, "loss": 0.0065, "num_tokens": 162017267.0, "reward": 0.7340684533119202, "reward_std": 0.2423768937587738, "rewards/reward_fn/mean": 0.7340684533119202, "rewards/reward_fn/std": 0.2423768788576126, "sampling/importance_sampling_ratio/max": 1.9951855540275574, "sampling/importance_sampling_ratio/mean": 0.3159673810005188, "sampling/importance_sampling_ratio/min": 1.557167797727743e-05, "sampling/sampling_logp_difference/max": 3.077336549758911, "sampling/sampling_logp_difference/mean": 0.005623812787234783, "step": 8180, "step_time": 6.956054246984422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 1971.6666666666667, "completions/mean_length": 1320.3229166666667, "completions/mean_terminated_length": 691.5526326497396, "completions/min_length": 150.33333333333334, "completions/min_terminated_length": 150.33333333333334, "entropy": 0.027668577060103415, "epoch": 0.984375, "frac_reward_zero_std": 0.0, "grad_norm": 0.007157952059060335, "learning_rate": 1.5745192307692306e-08, "loss": -0.0014, "num_tokens": 162258618.0, "reward": 0.7455661296844482, "reward_std": 0.21348504722118378, "rewards/reward_fn/mean": 0.7455661296844482, "rewards/reward_fn/std": 0.21348503728707632, "sampling/importance_sampling_ratio/max": 1.0128373503684998, "sampling/importance_sampling_ratio/mean": 0.22597470879554749, "sampling/importance_sampling_ratio/min": 0.0001150378601172027, "sampling/sampling_logp_difference/max": 1.762477437655131, "sampling/sampling_logp_difference/mean": 0.006163390818983316, "step": 8190, "step_time": 10.256314385682344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2430.0, "completions/mean_length": 1038.140625, "completions/mean_terminated_length": 533.5194396972656, "completions/min_length": 147.5, "completions/min_terminated_length": 147.5, "entropy": 0.028159769624471663, "epoch": 0.9855769230769231, "frac_reward_zero_std": 0.125, "grad_norm": 0.0035978544037789106, "learning_rate": 1.4543269230769231e-08, "loss": -0.0033, "num_tokens": 162399131.0, "reward": 0.767893522977829, "reward_std": 0.22440028935670853, "rewards/reward_fn/mean": 0.767893522977829, "rewards/reward_fn/std": 0.22440029680728912, "sampling/importance_sampling_ratio/max": 1.2225104570388794, "sampling/importance_sampling_ratio/mean": 0.3392341434955597, "sampling/importance_sampling_ratio/min": 1.4327198496817677e-06, "sampling/sampling_logp_difference/max": 10.10045051574707, "sampling/sampling_logp_difference/mean": 0.006619712570682168, "step": 8200, "step_time": 7.024518130160868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.17708333333333334, "completions/max_length": 3000.0, "completions/max_terminated_length": 2345.3333333333335, "completions/mean_length": 907.8958333333334, "completions/mean_terminated_length": 458.47144571940106, "completions/min_length": 123.33333333333333, "completions/min_terminated_length": 123.33333333333333, "entropy": 0.02196157332509756, "epoch": 0.9867788461538461, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.022132476791739464, "learning_rate": 1.3341346153846152e-08, "loss": 0.0078, "num_tokens": 162586721.0, "reward": 0.6617215176423391, "reward_std": 0.27937943240006763, "rewards/reward_fn/mean": 0.6617215176423391, "rewards/reward_fn/std": 0.2793794473012288, "sampling/importance_sampling_ratio/max": 2.087843338648478, "sampling/importance_sampling_ratio/mean": 0.5417542457580566, "sampling/importance_sampling_ratio/min": 5.605665319308173e-05, "sampling/sampling_logp_difference/max": 1.5358097155888875, "sampling/sampling_logp_difference/mean": 0.0052420722010234995, "step": 8210, "step_time": 10.075082119740546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2122.5, "completions/mean_length": 1019.6875, "completions/mean_terminated_length": 414.6112060546875, "completions/min_length": 116.5, "completions/min_terminated_length": 116.5, "entropy": 0.020381511747837068, "epoch": 0.9879807692307693, "frac_reward_zero_std": 0.125, "grad_norm": 0.0047319624572992325, "learning_rate": 1.2139423076923077e-08, "loss": -0.003, "num_tokens": 162720485.0, "reward": 0.6422277390956879, "reward_std": 0.32441940903663635, "rewards/reward_fn/mean": 0.6422277390956879, "rewards/reward_fn/std": 0.32441939413547516, "sampling/importance_sampling_ratio/max": 1.4153637290000916, "sampling/importance_sampling_ratio/mean": 0.5034635961055756, "sampling/importance_sampling_ratio/min": 0.00024694002786418423, "sampling/sampling_logp_difference/max": 2.029874324798584, "sampling/sampling_logp_difference/mean": 0.004414177383296192, "step": 8220, "step_time": 7.18777135098353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2604166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 1512.6666666666667, "completions/mean_length": 1082.6875, "completions/mean_terminated_length": 418.994140625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.023536817543208598, "epoch": 0.9891826923076923, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.006671891547739506, "learning_rate": 1.09375e-08, "loss": 0.0102, "num_tokens": 162916823.0, "reward": 0.7582734823226929, "reward_std": 0.24212215840816498, "rewards/reward_fn/mean": 0.7582734823226929, "rewards/reward_fn/std": 0.24212215344111124, "sampling/importance_sampling_ratio/max": 2.3637179931004844, "sampling/importance_sampling_ratio/mean": 0.5394408504168192, "sampling/importance_sampling_ratio/min": 0.00011783768170895807, "sampling/sampling_logp_difference/max": 3.316054105758667, "sampling/sampling_logp_difference/mean": 0.004977632313966751, "step": 8230, "step_time": 10.106529659032821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2063.5, "completions/mean_length": 1289.71875, "completions/mean_terminated_length": 845.0463256835938, "completions/min_length": 242.5, "completions/min_terminated_length": 242.5, "entropy": 0.022705166414380074, "epoch": 0.9903846153846154, "frac_reward_zero_std": 0.0, "grad_norm": 0.020595470443367958, "learning_rate": 9.735576923076922e-09, "loss": -0.0008, "num_tokens": 163073909.0, "reward": 0.6669546961784363, "reward_std": 0.30518828332424164, "rewards/reward_fn/mean": 0.6669546961784363, "rewards/reward_fn/std": 0.30518826842308044, "sampling/importance_sampling_ratio/max": 0.928863525390625, "sampling/importance_sampling_ratio/mean": 0.23549267649650574, "sampling/importance_sampling_ratio/min": 8.644025547255296e-05, "sampling/sampling_logp_difference/max": 1.8129093647003174, "sampling/sampling_logp_difference/mean": 0.005249444395303726, "step": 8240, "step_time": 6.964328511804342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3854166666666667, "completions/max_length": 3000.0, "completions/max_terminated_length": 2270.6666666666665, "completions/mean_length": 1506.21875, "completions/mean_terminated_length": 639.4557291666666, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "entropy": 0.025183646753430367, "epoch": 0.9915865384615384, "frac_reward_zero_std": 0.0, "grad_norm": 0.009804450906813145, "learning_rate": 8.533653846153845e-09, "loss": 0.0022, "num_tokens": 163350178.0, "reward": 0.7199353774388632, "reward_std": 0.1898240844408671, "rewards/reward_fn/mean": 0.7199353774388632, "rewards/reward_fn/std": 0.18982407947381338, "sampling/importance_sampling_ratio/max": 1.5073432524998982, "sampling/importance_sampling_ratio/mean": 0.2341568867365519, "sampling/importance_sampling_ratio/min": 4.2745235835658e-05, "sampling/sampling_logp_difference/max": 2.5499931971232095, "sampling/sampling_logp_difference/mean": 0.005691771240284045, "step": 8250, "step_time": 10.208579357340932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2512.5, "completions/mean_length": 1065.796875, "completions/mean_terminated_length": 580.1620483398438, "completions/min_length": 136.5, "completions/min_terminated_length": 136.5, "entropy": 0.025158329866826534, "epoch": 0.9927884615384616, "frac_reward_zero_std": 0.0, "grad_norm": 0.015638601034879684, "learning_rate": 7.331730769230769e-09, "loss": -0.0012, "num_tokens": 163501741.0, "reward": 0.7052009105682373, "reward_std": 0.27017562091350555, "rewards/reward_fn/mean": 0.7052009105682373, "rewards/reward_fn/std": 0.27017561346292496, "sampling/importance_sampling_ratio/max": 1.1497248411178589, "sampling/importance_sampling_ratio/mean": 0.3817374110221863, "sampling/importance_sampling_ratio/min": 2.9298012577783084e-05, "sampling/sampling_logp_difference/max": 1.7793943285942078, "sampling/sampling_logp_difference/mean": 0.005692372564226389, "step": 8260, "step_time": 7.1402087402530015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1241.6666666666667, "completions/mean_length": 972.7291666666666, "completions/mean_terminated_length": 395.51824951171875, "completions/min_length": 144.33333333333334, "completions/min_terminated_length": 144.33333333333334, "entropy": 0.024896788969635965, "epoch": 0.9939903846153846, "frac_reward_zero_std": 0.0, "grad_norm": 0.013585871085524559, "learning_rate": 6.129807692307692e-09, "loss": -0.0052, "num_tokens": 163698851.0, "reward": 0.7633165319760641, "reward_std": 0.25857049226760864, "rewards/reward_fn/mean": 0.7633165319760641, "rewards/reward_fn/std": 0.25857048233350116, "sampling/importance_sampling_ratio/max": 2.0492610136667886, "sampling/importance_sampling_ratio/mean": 0.4887112081050873, "sampling/importance_sampling_ratio/min": 0.0007421194918606488, "sampling/sampling_logp_difference/max": 1.960729996363322, "sampling/sampling_logp_difference/mean": 0.0056888647377491, "step": 8270, "step_time": 9.96052526999265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 3000.0, "completions/max_terminated_length": 2062.5, "completions/mean_length": 1084.0, "completions/mean_terminated_length": 500.1940002441406, "completions/min_length": 125.5, "completions/min_terminated_length": 125.5, "entropy": 0.023938989080488682, "epoch": 0.9951923076923077, "frac_reward_zero_std": 0.0, "grad_norm": 0.02789430133998394, "learning_rate": 4.927884615384615e-09, "loss": 0.0003, "num_tokens": 163841915.0, "reward": 0.7220236361026764, "reward_std": 0.28312505781650543, "rewards/reward_fn/mean": 0.7220236361026764, "rewards/reward_fn/std": 0.2831250727176666, "sampling/importance_sampling_ratio/max": 1.187456488609314, "sampling/importance_sampling_ratio/mean": 0.39809948205947876, "sampling/importance_sampling_ratio/min": 0.0001645746947360749, "sampling/sampling_logp_difference/max": 1.7650976777076721, "sampling/sampling_logp_difference/mean": 0.0048763332888484, "step": 8280, "step_time": 6.981449053809047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3000.0, "completions/max_terminated_length": 2327.3333333333335, "completions/mean_length": 1314.4895833333333, "completions/mean_terminated_length": 654.9420471191406, "completions/min_length": 149.66666666666666, "completions/min_terminated_length": 149.66666666666666, "entropy": 0.025225123204290868, "epoch": 0.9963942307692307, "frac_reward_zero_std": 0.0, "grad_norm": 0.004508044105023146, "learning_rate": 3.7259615384615384e-09, "loss": -0.004, "num_tokens": 164057898.0, "reward": 0.7578553954760233, "reward_std": 0.23351742327213287, "rewards/reward_fn/mean": 0.7578553954760233, "rewards/reward_fn/std": 0.23351742823918661, "sampling/importance_sampling_ratio/max": 1.3072961568832397, "sampling/importance_sampling_ratio/mean": 0.2946525365114212, "sampling/importance_sampling_ratio/min": 7.973848672312064e-06, "sampling/sampling_logp_difference/max": 3.8771543502807617, "sampling/sampling_logp_difference/mean": 0.005841690891732772, "step": 8290, "step_time": 10.384404334891588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2589.5, "completions/max_terminated_length": 1984.0, "completions/mean_length": 935.46875, "completions/mean_terminated_length": 441.6932678222656, "completions/min_length": 117.5, "completions/min_terminated_length": 117.5, "entropy": 0.021875908598303795, "epoch": 0.9975961538461539, "frac_reward_zero_std": 0.125, "grad_norm": 0.031564466655254364, "learning_rate": 2.5240384615384617e-09, "loss": 0.0112, "num_tokens": 164194504.0, "reward": 0.7295878529548645, "reward_std": 0.2222772315144539, "rewards/reward_fn/mean": 0.7295878529548645, "rewards/reward_fn/std": 0.2222772166132927, "sampling/importance_sampling_ratio/max": 1.385953664779663, "sampling/importance_sampling_ratio/mean": 0.46730223298072815, "sampling/importance_sampling_ratio/min": 0.0009655660588805404, "sampling/sampling_logp_difference/max": 1.9711028337478638, "sampling/sampling_logp_difference/mean": 0.004709738539531827, "step": 8300, "step_time": 6.02900920053944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 3000.0, "completions/max_terminated_length": 1811.0, "completions/mean_length": 1001.6354166666666, "completions/mean_terminated_length": 618.5101928710938, "completions/min_length": 139.66666666666666, "completions/min_terminated_length": 139.66666666666666, "entropy": 0.02167619429528713, "epoch": 0.9987980769230769, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.02248666249215603, "learning_rate": 1.3221153846153846e-09, "loss": 0.0107, "num_tokens": 164376565.0, "reward": 0.7219341198603312, "reward_std": 0.2900700519482295, "rewards/reward_fn/mean": 0.7219341198603312, "rewards/reward_fn/std": 0.2900700519482295, "sampling/importance_sampling_ratio/max": 1.7225927511850994, "sampling/importance_sampling_ratio/mean": 0.46292125185330707, "sampling/importance_sampling_ratio/min": 0.00019390327757188666, "sampling/sampling_logp_difference/max": 1.9071723620096843, "sampling/sampling_logp_difference/mean": 0.005002560870101054, "step": 8310, "step_time": 10.140967719163745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 3000.0, "completions/max_terminated_length": 1876.5, "completions/mean_length": 910.03125, "completions/mean_terminated_length": 476.5726623535156, "completions/min_length": 153.5, "completions/min_terminated_length": 153.5, "entropy": 0.025047704949975012, "epoch": 1.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.01570754125714302, "learning_rate": 1.201923076923077e-10, "loss": 0.0548, "num_tokens": 164498815.0, "reward": 0.8129001259803772, "reward_std": 0.2249763309955597, "rewards/reward_fn/mean": 0.8129001259803772, "rewards/reward_fn/std": 0.2249763160943985, "sampling/importance_sampling_ratio/max": 1.6661062240600586, "sampling/importance_sampling_ratio/mean": 0.42729660868644714, "sampling/importance_sampling_ratio/min": 0.00020159761936611176, "sampling/sampling_logp_difference/max": 3.5635961294174194, "sampling/sampling_logp_difference/mean": 0.0056295504327863455, "step": 8320, "step_time": 6.8313975523225965 } ], "logging_steps": 10, "max_steps": 8320, "num_input_tokens_seen": 164498815, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }